diff --git a/clearcode/cdutils.py b/clearcode/cdutils.py index 2dfd9824..73d979bb 100644 --- a/clearcode/cdutils.py +++ b/clearcode/cdutils.py @@ -34,7 +34,7 @@ from packageurl import PackageURL """ -ClearlyDefined utlities. +ClearlyDefined utilities. """ TRACE_FETCH = False @@ -532,7 +532,7 @@ def str2coord(s): segments = s.strip(splitter).split(splitter) if is_urn or is_url: segments = segments[1:] - # ignore extra segments for now beyond the 5 fisrt (such as the PR of a curation) + # ignore extra segments for now beyond the 5 first (such as the PR of a curation) segments = segments[:5] fields = ( diff --git a/clearcode/store_scans.py b/clearcode/store_scans.py index 08132899..8ca977a7 100644 --- a/clearcode/store_scans.py +++ b/clearcode/store_scans.py @@ -34,8 +34,8 @@ """ The input is a bunch of scans from ClearlyDefined and -the output is a bunch of git repositories with commited and -pushed scans such that we balance the scans roughly evenly accross +the output is a bunch of git repositories with committed and +pushed scans such that we balance the scans roughly evenly across different repositories. The primary reason for multiple repositories is size of a single @@ -127,7 +127,7 @@ def get_cd_item_by_purl_hash(cd_items): def add_scancode_scan(repo, purl, scancode_scan): """ Save and commit scancode scan for purl to git repo. - Return true if we commited else false + Return true if we committed else false """ purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo) scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json" diff --git a/clearindex/management/commands/run_clearindex.py b/clearindex/management/commands/run_clearindex.py index 6d729d58..1c397920 100644 --- a/clearindex/management/commands/run_clearindex.py +++ b/clearindex/management/commands/run_clearindex.py @@ -161,7 +161,7 @@ def handle(self, *args, **options): def map_definition(cditem): """ Map a CD definition. 
Return the Package created from a mapped CD definition - or None if a Package could not be created or an Exception has occured. + or None if a Package could not be created or an Exception has occurred. """ try: with transaction.atomic(): @@ -328,7 +328,7 @@ def str2coord(s): segments = s.strip(splitter).split(splitter) if is_urn or is_url: segments = segments[1:] - # ignore extra segments for now beyond the 5 fisrt (such as the PR of a curation) + # ignore extra segments for now beyond the 5 first (such as the PR of a curation) segments = segments[:5] fields = ( diff --git a/docs/source/how-to-guides/deploy_to_devel.rst b/docs/source/how-to-guides/deploy_to_devel.rst index 730153c4..88c89bd4 100644 --- a/docs/source/how-to-guides/deploy_to_devel.rst +++ b/docs/source/how-to-guides/deploy_to_devel.rst @@ -4,7 +4,7 @@ Map deployed code back to source code aka. back2source ======================================================= -In this tutorial we excercise the ScanCode.io pipeline used map the deployed binaries back to the +In this tutorial we exercise the ScanCode.io pipeline used to map the deployed binaries back to the assumed source code of a package, or map source archives to the sources from a version control system (VCS) checkout. @@ -59,17 +59,17 @@ Yet these assumption are often proven wrong and the potential for many issues: _ incident where the source archive of the XZ Utils packages had been modified to create a malicious SSH backdoor. These cases need to be detected ideally before the source code is even built. back2source has been detecting the - XZ malicious automake build scripts as requring review, and this using code available before the + XZ malicious automake build scripts as requiring review, and this using code available before the XZ backdoor issue was known.
-- Extra code may be provisioned and routinely injected or complied in the final binary without +- Extra code may be provisioned and routinely injected or compiled in the final binary without malice. - For instance, an "UberJAR" is created as a larger Java JAR _ as the combination of multiple JARS. The other JARs are fetched at built time and not present in source code form and commonly without metadata to help track their origin. This means that using - package A, means really using unknowningly A, but also B and C. There are license and security + package A, means really using unknowingly A, but also B and C. There are license and security implications when the license, origin and vulnerability status of B and C goes undetected. Most tools do not detect these extra package inclusions. @@ -102,12 +102,12 @@ The ScanCode.io pipeline supports these technologies: - end-to-end ELF binaries package binary to source analysis. The focus is on on binaries compiled from C (C++ will be implemented separately in the future as it requires additional demangling of - function signatures). This analysis is based extracting DWARF debug symbols compliation unit + function signatures). This analysis is based on extracting DWARF debug symbols compilation unit references. -- end-to-end Go binary executable to source analysis o binary to source analysis. Note that Go is - special, as while its targets binaries are compiled to ELF, Macho-O and Windows PE/COFF formats, - depending on the operating system target and can also be anlyzed as an ELF for Linux, a Go +- end-to-end Go binary executable to source analysis or binary to source analysis. Note that Go is + special, as while its target binaries are compiled to ELF, Mach-O and Windows PE/COFF formats, + depending on the operating system target and can also be analyzed as an ELF for Linux, a Go binary also contains extra information to map source and binaries together through a specific data structure.
This pipeline will be using this data structure (aka. the pclntab). @@ -450,6 +450,6 @@ Here is how the project creation looks like: .. image:: images/d2d-images/43a5ff56-fb36-45c7-82bb-8b5256759eee.png -- Inthe resource page, there are also file-level mappings details: +- In the resource page, there are also file-level mappings details: .. image:: images/d2d-images/4acd087e-0cd1-4361-a8ee-f7af7681c74e.png diff --git a/docs/source/how-to-guides/symbols_and_strings.rst b/docs/source/how-to-guides/symbols_and_strings.rst index be401564..30fd3f71 100644 --- a/docs/source/how-to-guides/symbols_and_strings.rst +++ b/docs/source/how-to-guides/symbols_and_strings.rst @@ -76,7 +76,7 @@ Binary analysis ~~~~~~~~~~~~~~~~ Once we have collected symbols and strings from the source code, we can search these in a binary. -The presence of these symbols in the binaries can be used to find the origin of code complied in +The presence of these symbols in the binaries can be used to find the origin of code compiled in binaries with a lightweight "reverse" engineering process. For instance, a tool like BANG _ can use the source symbols to build automatons-based search indexes to support an efficient binary origin analysis. diff --git a/docs/source/purldb/rest_api.rst b/docs/source/purldb/rest_api.rst index 0fe02d27..f34b62bd 100644 --- a/docs/source/purldb/rest_api.rst +++ b/docs/source/purldb/rest_api.rst @@ -133,7 +133,7 @@ An API endpoint that provides the ability to list and get packages. "parties": [ { "type": "person", - "role": "developper", + "role": "developer", "name": "Elastic", "email": null, "url": "https://www.elastic.co" @@ -277,7 +277,7 @@ The package details view returns all information available about a package. 
"parties": [ { "type": "person", - "role": "developper", + "role": "developer", "name": "Elastic", "email": null, "url": "https://www.elastic.co" @@ -468,7 +468,7 @@ Using cURL to get enhanced package data: "parties": [ { "type": "person", - "role": "developper", + "role": "developer", "name": "Elastic", "email": null, "url": "https://www.elastic.co" @@ -550,7 +550,7 @@ Using cURL to reindex a package: Filter by checksum ~~~~~~~~~~~~~~~~~~ -Take a mapping, where the keys are the names of the checksum algorthm and the +Take a mapping, where the keys are the names of the checksum algorithm and the values is a list of checksum values and query those values against the packagedb. @@ -666,7 +666,7 @@ One action is available on resources: Filter by checksum ~~~~~~~~~~~~~~~~~~ -Take a mapping, where the keys are the names of the checksum algorthm and the +Take a mapping, where the keys are the names of the checksum algorithm and the values is a list of checksum values and query those values against the packagedb. diff --git a/docs/source/purldb/symbol_and_string_collection.rst b/docs/source/purldb/symbol_and_string_collection.rst index c4579f72..e61a512b 100644 --- a/docs/source/purldb/symbol_and_string_collection.rst +++ b/docs/source/purldb/symbol_and_string_collection.rst @@ -11,9 +11,9 @@ pipeline and stores them in the ``extra_data`` field of the resource model. What are symbols? ------------------ -Source code symbols are the names of the functions, methods, classes, varibales and data structures -as found in source code. Another name is "identifiers". Source code iterals (or "strings") are the -string values of variables, such as messages asssigned to a variable or constant in the source code +Source code symbols are the names of the functions, methods, classes, variables and data structures +as found in source code. Another name is "identifiers". 
Source code literals (or "strings") are the +string values of variables, such as messages assigned to a variable or constant in the source code of a program. Why would you want to collect source symbols? diff --git a/etc/ci/macports-ci b/etc/ci/macports-ci index ac474e4e..b4351ef1 100644 --- a/etc/ci/macports-ci +++ b/etc/ci/macports-ci @@ -190,7 +190,7 @@ do # this check confirms that ports were installed # notice that port -N selfupdate && break is not sufficient as a test # (sometime it returns a success even though ports have not been installed) -# for some misterious reasons, running without "-d" does not work in some case +# for some mysterious reasons, running without "-d" does not work in some case sudo port -d -N selfupdate 2>&1 | grep -v DEBUG | awk '{if($1!="x")print}' port info xdrfile > /dev/null && break || true sleep 5 diff --git a/etc/scripts/fetch_thirdparty.py b/etc/scripts/fetch_thirdparty.py index 76a19a60..ac9f9379 100644 --- a/etc/scripts/fetch_thirdparty.py +++ b/etc/scripts/fetch_thirdparty.py @@ -166,7 +166,7 @@ def fetch_thirdparty( Download the PyPI packages listed in the combination of: - the pip requirements --requirements REQUIREMENT-FILE(s), - the pip name==version --specifier SPECIFIER(s) - - any pre-existing wheels or sdsists found in --dest-dir THIRDPARTY_DIR. + - any pre-existing wheels or sdists found in --dest-dir THIRDPARTY_DIR. 
Download wheels with the --wheels option for the ``--python-version`` PYVER(s) and ``--operating_system`` OS(s) combinations defaulting to all diff --git a/etc/scripts/gen_pypi_simple.py b/etc/scripts/gen_pypi_simple.py index 89d06265..a85a15f6 100644 --- a/etc/scripts/gen_pypi_simple.py +++ b/etc/scripts/gen_pypi_simple.py @@ -177,7 +177,7 @@ def simple_index_entry(self, base_url): def build_pypi_index(directory, base_url="https://thirdparty.aboutcode.org/pypi"): """ Create the a PyPI simple directory index using a ``directory`` directory of wheels and sdists in - the direvctory at ``directory``/simple/ populated with the proper PyPI simple index directory + the directory at ``directory``/simple/ populated with the proper PyPI simple index directory structure crafted using symlinks. WARNING: The ``directory``/simple/ directory is removed if it exists. NOTE: in addition to the a diff --git a/etc/scripts/gen_requirements.py b/etc/scripts/gen_requirements.py index 1b879442..626b7011 100644 --- a/etc/scripts/gen_requirements.py +++ b/etc/scripts/gen_requirements.py @@ -15,7 +15,7 @@ """ Utilities to manage requirements files. NOTE: this should use ONLY the standard library and not import anything else -because this is used for boostrapping with no requirements installed. +because this is used for bootstrapping with no requirements installed. """ diff --git a/etc/scripts/gen_requirements_dev.py b/etc/scripts/gen_requirements_dev.py index 85482056..c005e574 100644 --- a/etc/scripts/gen_requirements_dev.py +++ b/etc/scripts/gen_requirements_dev.py @@ -15,7 +15,7 @@ """ Utilities to manage requirements files. NOTE: this should use ONLY the standard library and not import anything else -because this is used for boostrapping with no requirements installed. +because this is used for bootstrapping with no requirements installed. 
""" diff --git a/etc/scripts/utils_dejacode.py b/etc/scripts/utils_dejacode.py index b6bff518..417e0a3f 100644 --- a/etc/scripts/utils_dejacode.py +++ b/etc/scripts/utils_dejacode.py @@ -86,7 +86,7 @@ def update_with_dejacode_data(distribution): def update_with_dejacode_about_data(distribution): """ - Update the Distribution `distribution` wiht ABOUT code data fetched from + Update the Distribution `distribution` with ABOUT code data fetched from DejaCode. Return True if data was updated. """ package_data = get_package_data(distribution) diff --git a/etc/scripts/utils_requirements.py b/etc/scripts/utils_requirements.py index 8faa25e4..ae4ea8cd 100644 --- a/etc/scripts/utils_requirements.py +++ b/etc/scripts/utils_requirements.py @@ -14,7 +14,7 @@ """ Utilities to manage requirements files and call pip. NOTE: this should use ONLY the standard library and not import anything else -because this is used for boostrapping with no requirements installed. +because this is used for bootstrapping with no requirements installed. """ diff --git a/etc/scripts/utils_thirdparty.py b/etc/scripts/utils_thirdparty.py index 8e1ba819..6ccae94b 100644 --- a/etc/scripts/utils_thirdparty.py +++ b/etc/scripts/utils_thirdparty.py @@ -68,15 +68,15 @@ was built for and these tags can be matched to an Environment. - An Environment is a combination of a Python version and operating system - (e.g., platfiorm and ABI tags.) and is represented by the "tags" it supports. + (e.g., platform and ABI tags.) and is represented by the "tags" it supports. - A plain LinksRepository which is just a collection of URLs scrape from a web - page such as HTTP diretory listing. It is used either with pip "--find-links" + page such as HTTP directory listing. It is used either with pip "--find-links" option or to fetch ABOUT and LICENSE files. - A PypiSimpleRepository is a PyPI "simple" index where a HTML page is listing package name links. 
Each such link points to an HTML page listing URLs to all - wheels and sdsist of all versions of this package. + wheels and sdist of all versions of this package. PypiSimpleRepository and Packages are related through packages name, version and filenames. @@ -265,7 +265,7 @@ def download_wheel(name, version, environment, dest_dir=THIRDPARTY_DIR, repos=tu fetched_wheel_filenames.append(fetched_wheel_filename) if fetched_wheel_filenames: - # do not futher fetch from other repos if we find in first, typically PyPI + # do not further fetch from other repos if we find in first, typically PyPI break return fetched_wheel_filenames @@ -305,7 +305,7 @@ def download_sdist(name, version, dest_dir=THIRDPARTY_DIR, repos=tuple()): fetched_sdist_filename = package.sdist.download(dest_dir=dest_dir) if fetched_sdist_filename: - # do not futher fetch from other repos if we find in first, typically PyPI + # do not further fetch from other repos if we find in first, typically PyPI break return fetched_sdist_filename @@ -1648,7 +1648,7 @@ def _get_package_versions_map(self, name): self.fetched_package_normalized_names.add(normalized_name) try: links = self.fetch_links(normalized_name=normalized_name) - # note that thsi is sorted so the mapping is also sorted + # note that this is sorted so the mapping is also sorted versions = { package.version: package for package in PypiPackage.packages_from_many_paths_or_urls(paths_or_urls=links) diff --git a/matchcode/match.py b/matchcode/match.py index 4fb9e4ce..0bec7321 100644 --- a/matchcode/match.py +++ b/matchcode/match.py @@ -173,7 +173,7 @@ def individual_file_match(codebase): def approximate_file_match(codebase): """ - Update Matches from approximatly matched Package files in `codebase`. + Update Matches from approximately matched Package files in `codebase`. Return the number of approximate matches found in `codebase`. 
""" diff --git a/matchcode/models.py b/matchcode/models.py index ff9a84dc..bda967cb 100644 --- a/matchcode/models.py +++ b/matchcode/models.py @@ -496,7 +496,7 @@ def match(cls, fingerprints): # strip positions only_fings = [hexstring_to_binarray(fing["snippet"]) for fing in fingerprints] - # Step 0: get all fingerprint records that match whith the input + # Step 0: get all fingerprint records that match with the input matched_fps = cls.objects.filter(fingerprint__in=only_fings) # Step 1: count Packages whose fingerprints appear @@ -559,7 +559,7 @@ def match_resources(cls, fingerprints, top=None, **kwargs): # TODO: track matched package and package resource in ExtendedFileFragmentMatch - # Step 0: get all fingerprint records that match whith the input + # Step 0: get all fingerprint records that match with the input matched_fps = cls.objects.filter(fingerprint__in=only_fings) # Step 1: get Resources that show up in the query diff --git a/matchcode/utils.py b/matchcode/utils.py index 48a1df84..a59cb9b6 100644 --- a/matchcode/utils.py +++ b/matchcode/utils.py @@ -200,7 +200,7 @@ def index_resource_fingerprints(codebase, package): Return a tuple of integers, `indexed_adci`, `indexed_adsi`, and `indexed_arci` that represent the number of indexed ApproximateDirectoryContentIndex, ApproximateDirectoryStructureIndex, and - ApproximateResourceContentIndex created, respectivly. + ApproximateResourceContentIndex created, respectively. """ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex @@ -252,7 +252,7 @@ def index_package_directories(package): Return a tuple of integers, `indexed_adci`, `indexed_adsi`, and `indexed_arci` that represent the number of indexed ApproximateDirectoryContentIndex, ApproximateDirectoryStructureIndex, and - ApproximateResourceContentIndex created, respectivly. + ApproximateResourceContentIndex created, respectively. 
Return 0, 0, 0 if a VirtualCodebase cannot be created from the Resources of a Package. diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index fc538e50..6cdd857b 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -33,7 +33,7 @@ class Matching(ScanCodebase, LoadInventory): 2. Match archive to Resources 3. Match directory exactly 4. Match files exactly - 5. Match directories approximatly + 5. Match directories approximately 6. Match files approximately 7. Matching on similar file attributes (path, type, extension, size, Java classpath, etc.) 8. Return only the best matches (We could inject some user input, policies, we could provide a list of purls to guide matching, ) diff --git a/matchcode_pipeline/pipes/matching.py b/matchcode_pipeline/pipes/matching.py index 1e56f0b6..4f9f3fa7 100644 --- a/matchcode_pipeline/pipes/matching.py +++ b/matchcode_pipeline/pipes/matching.py @@ -61,7 +61,7 @@ def get_project_resources_qs(project, resources): # intend to. For example, if we have matched on the directory with # the path `foo/bar/1`, using the __startswith filter without # including a trailing slash on the path would have us get all - # diretories under `foo/bar/` that start with 1, such as + # directories under `foo/bar/` that start with 1, such as # `foo/bar/10001`, `foo/bar/123`, etc., when we just want `foo/bar/1` # and its descendants. path = f"{resource.path}/" @@ -243,7 +243,7 @@ def match_purldb_directory(project, resource, exact_match=False): def match_sha1s_to_purldb(project, resources_by_sha1, matcher_func, package_data_by_purldb_urls): """ Process `resources_by_sha1` with `matcher_func` and return a 3-tuple - contaning an empty defaultdict(list), the number of matches and the number + containing an empty defaultdict(list), the number of matches and the number of sha1s sent to purldb. 
""" matched_count = matcher_func( diff --git a/minecode/collectors/debian.py b/minecode/collectors/debian.py index c8e8ea85..22f857f4 100644 --- a/minecode/collectors/debian.py +++ b/minecode/collectors/debian.py @@ -56,7 +56,7 @@ def process_request(purl_str, **kwargs): source_package_url = PackageURL.from_string(source_purl) except ValueError as e: - error = f"error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}" + error = f"error occurred when parsing purl: {purl_str} source_purl: {source_purl} : {e}" return error has_version = bool(package_url.version) @@ -75,7 +75,7 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0): """ Add a debian `package_url` to the PackageDB. - Return an error string if errors have occured in the process. + Return an error string if errors have occurred in the process. """ from minecode.model_utils import add_package_to_scan_queue from minecode.model_utils import merge_or_create_package diff --git a/minecode/collectors/generic.py b/minecode/collectors/generic.py index 09cbf395..1cc98a27 100644 --- a/minecode/collectors/generic.py +++ b/minecode/collectors/generic.py @@ -76,7 +76,7 @@ def process_request(purl_str, **kwargs): try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f"error occured when parsing {purl_str}: {e}" + error = f"error occurred when parsing {purl_str}: {e}" return error download_url = package_url.qualifiers.get("download_url") diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index 8f0fb4d3..19ba6069 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -97,7 +97,7 @@ def get_packages(self, content=None): # build a URL: This is the real JAR download URL # FIXME: this should be set at the time of creating Artifacts - # instead togther with the filename... especially we could use + # instead together with the filename... especially we could use # different REPOs. 
jar_download_url, _ = build_url_and_filename( group_id, artifact_id, version, extension, classifier @@ -262,7 +262,7 @@ def map_maven_package(package_url, package_content, pipelines, priority=0, reind """ Add a maven `package_url` to the PackageDB. - Return an error string if errors have occured in the process. + Return an error string if errors have occurred in the process. if ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package. """ @@ -457,7 +457,7 @@ def process_request(purl_str, **kwargs): try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f"error occured when parsing {purl_str}: {e}" + error = f"error occurred when parsing {purl_str}: {e}" return error has_version = bool(package_url.version) diff --git a/minecode/management/commands/check_uri.py b/minecode/management/commands/check_uri.py index 9c91c493..620f7ba3 100644 --- a/minecode/management/commands/check_uri.py +++ b/minecode/management/commands/check_uri.py @@ -67,16 +67,16 @@ def handle(self, *args, **options): # TODO: add if the uri be resolved by visit and/or map router for uri in uris: try: - # FIXME: resolve() returns an acutal Visitor object, using module names for now + # FIXME: resolve() returns an actual Visitor object, using module names for now visit_route_resolve = repr(visit_router.resolve(uri.uri)) except NoRouteAvailable: - visit_route_resolve = "No Route Availible" + visit_route_resolve = "No Route Available" try: - # FIXME: resolve() returns an acutal Mapper object, using module names for now + # FIXME: resolve() returns an actual Mapper object, using module names for now map_route_resolve = repr(map_router.resolve(uri.uri)) except NoRouteAvailable: - map_route_resolve = "No Route Availible" + map_route_resolve = "No Route Available" if uri.last_visit_date: last_visit_date = uri.last_visit_date.isoformat() diff --git a/minecode/management/commands/manage_scans.py b/minecode/management/commands/manage_scans.py index 
4693a970..a9c29557 100644 --- a/minecode/management/commands/manage_scans.py +++ b/minecode/management/commands/manage_scans.py @@ -123,7 +123,7 @@ def get_next_uri(self): def process_scan(scannable_uri, **kwargs): """ Process a single `scannable_uri` ScannableURI. Subclasses must implement. - If sucessfully processed the ScannableURI must be updated accordingly. + If successfully processed the ScannableURI must be updated accordingly. """ pass diff --git a/minecode/management/commands/run_map.py b/minecode/management/commands/run_map.py index 7734c806..e527b877 100644 --- a/minecode/management/commands/run_map.py +++ b/minecode/management/commands/run_map.py @@ -174,7 +174,7 @@ def map_uri(resource_uri, _map_router=map_router): resource_uri.last_map_date = timezone.now() resource_uri.wip_date = None # always set the map error, resetting it to empty if the mapping was - # succesful + # successful if map_error: resource_uri.map_error = map_error else: diff --git a/minecode/management/commands/run_visit.py b/minecode/management/commands/run_visit.py index 3c90a3cb..e5482262 100644 --- a/minecode/management/commands/run_visit.py +++ b/minecode/management/commands/run_visit.py @@ -276,7 +276,7 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout # a route is added. return 0 except (ConnectionError, Timeout, Exception) as e: - # FIXME: is catching all expections here correct? + # FIXME: is catching all exceptions here correct? msg = f"Visit error for URI: {uri_to_visit}" msg += "\n".format() msg += get_error_message(e) @@ -341,7 +341,7 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout logger.debug(f" + NOT Inserted:\t{uri_str}") except Exception as e: - # FIXME: is catching all expections here correct? + # FIXME: is catching all exceptions here correct? 
msg = f"ERROR while processing URI from a visit through: {uri_str}" msg += "\n" msg += repr(visited_uri) diff --git a/minecode/miners/__init__.py b/minecode/miners/__init__.py index 866f6706..29b06ba6 100644 --- a/minecode/miners/__init__.py +++ b/minecode/miners/__init__.py @@ -15,7 +15,7 @@ from minecode.utils import get_temp_file -# FIXME: use attr or use a plain ResourceURI object insteaad +# FIXME: use attr or use a plain ResourceURI object instead @total_ordering class URI: """ @@ -98,7 +98,7 @@ def __init__( def to_dict(self, data_is_json=False): """ - Return an ordered seralization of self. + Return an ordered serialization of self. Treat data as JSON if `data_is_json` is True """ ordered_dict = dict() @@ -173,7 +173,7 @@ def get_uris(self, content): def dumps(self, content): """ - Return the content seralized as a string suitable for storing in a + Return the content serialized as a string suitable for storing in a database text blob. Subclasses should override when they support structured content (such as JSON). """ @@ -181,7 +181,7 @@ def dumps(self, content): def loads(self, content): """ - Return a Python data structure loaded from a content seralized as a + Return a Python data structure loaded from a content serialized as a string either as fetched or loaded from the database. Subclasses should override when they support structured content (such as JSON). """ diff --git a/minecode/miners/apache.py b/minecode/miners/apache.py index 0b956644..bfb0f2eb 100644 --- a/minecode/miners/apache.py +++ b/minecode/miners/apache.py @@ -318,14 +318,14 @@ def get_uris(self, content): class ApacheSingleProjectJsonVisitor(HttpJsonVisitor): """ Collect json content from single project json file. It does not - return any URI as the json contains the project meatadata only, so + return any URI as the json contains the project metadata only, so this visitor is getting the json to pass to mapper. """ pass -# FIXME: what can we do with a homepage and nam, packagedb wise?? 
+# FIXME: what can we do with a homepage and name, packagedb wise?? # @visit_router.route('https://projects.apache.org/json/foundation/podlings.json') class ApachePodlingsJsonVisitor(HttpJsonVisitor): """ @@ -582,7 +582,7 @@ def get_name_version(uri): if is_all_int: version = segment except ValueError: - # Connect the package_name with - because we split it with - eariler, util + # Connect the package_name with - because we split it with - earlier, util # when we meet version, package_name should be good. if not package_name: package_name = segment diff --git a/minecode/miners/cpan.py b/minecode/miners/cpan.py index 77ec4f5d..5d63353a 100644 --- a/minecode/miners/cpan.py +++ b/minecode/miners/cpan.py @@ -154,9 +154,9 @@ def get_uris(self, content): else: name = url name = name.replace("tar.gz", "").replace(".readme", "").replace(".meta", "") - partions = name.rpartition("-") - name = partions[0] - version = partions[-1] + partitions = name.rpartition("-") + name = partitions[0] + version = partitions[-1] package_url = None if name and version: package_url = PackageURL(type="cpan", name=name, version=version).to_string() @@ -329,7 +329,7 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): uri: the uri of the ResourceURI object purl: String value of the package url of the ResourceURI object """ - # FIXME: it does not make sense to use a single functin tod eal with the two + # FIXME: it does not make sense to use a single function tod eal with the two # formats IMHO if is_json(metadata): content = json.loads(metadata) diff --git a/minecode/miners/eclipse.py b/minecode/miners/eclipse.py index fb2f44d4..6993f05f 100644 --- a/minecode/miners/eclipse.py +++ b/minecode/miners/eclipse.py @@ -267,7 +267,7 @@ def build_packages(html_text, purl=None, uri=None): if "class" not in div.attrs: continue if "field-name-field-project-licenses" in div.attrs.get("class"): - # Visit div element whose class atttribute is field-name-field-project-licenses + # Visit div 
element whose class attribute is field-name-field-project-licenses for a in div.find_all(name="a"): if "href" not in a.attrs: continue diff --git a/minecode/miners/github.py b/minecode/miners/github.py index bde7e93f..874a4164 100644 --- a/minecode/miners/github.py +++ b/minecode/miners/github.py @@ -90,7 +90,7 @@ class GithubSingleRepoVisitor(HttpJsonVisitor): def fetch(self, uri, timeout=None): """ Having its own fetch function instead of inheriting from HttpJsonVisitor class is because: - The json itself has lots of URL info, the Github API can get content without acccessing the URLs inside the json explicitly. + The json itself has lots of URL info, the Github API can get content without accessing the URLs inside the json explicitly. The main idea is to fetch download_url... """ full_name = uri.replace("https://api.github.com/repos/", "") diff --git a/minecode/miners/haxe.py b/minecode/miners/haxe.py index 0d7c17ee..0540203e 100644 --- a/minecode/miners/haxe.py +++ b/minecode/miners/haxe.py @@ -72,7 +72,7 @@ def get_uris(self, content): continue href = a["href"] if href and href.startswith("/p/") and href.endswith("/"): - # Parse if the href contains the versino info: + # Parse if the href contains the version info: project_version = href.replace("/p/", "").rstrip("/") project_version = project_version.split("/") if len(project_version) == 2: diff --git a/minecode/miners/maven.py b/minecode/miners/maven.py index 6dca5f71..369da4ea 100644 --- a/minecode/miners/maven.py +++ b/minecode/miners/maven.py @@ -98,7 +98,7 @@ def get_seeds(self): # other repos: http://stackoverflow.com/a/161846/302521 # 1. google has a mirror https://www.infoq.com/news/2015/11/maven-central-at-google # https://maven-central.storage.googleapis.com/repos/central/data/.index/nexus-maven-repository-index.properties - # 2. apache has a possible mirro at http://repo.maven.apache.org/maven2/.index/nexus-maven-repository-index.properties + # 2. 
apache has a possible mirror at http://repo.maven.apache.org/maven2/.index/nexus-maven-repository-index.properties # 3. ibiblio has an out of date mirror that has no directory listing and was last updated on 20161121171437 # clojars is not a mirror, but its own repo: https://clojars.org/repo/.index/ # other mirrors https://www.google.com/search?q=allinurl%3A%20.index%2Fnexus-maven-repository-index.properties&pws=0&gl=us&gws_rd=cr @@ -121,7 +121,7 @@ def get_uris(self, content): nexus.index.incremental-15=526 nexus.index.incremental-14=527 - Each value points to a fragment increamental index that has the same + Each value points to a fragment incremental index that has the same format as the bigger one. """ base_url = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz" @@ -190,7 +190,7 @@ def get_uris(self, content): # build a URL: This is the real JAR download URL # FIXME: this should be set at the time of creating Artifacts - # instead togther with the filename... especially we could use + # instead together with the filename... especially we could use # different REPOs. jar_download_url, file_name = build_url_and_filename( group_id, artifact_id, version, extension, classifier @@ -258,7 +258,7 @@ def get_uris(self, content): class MavenHTMLPageVisitor(HttpVisitor): """ Parse the HTML page and yield all necessary uris from the page and its sub pages. - Note that the regex of the route expression is using . to map any characters except new line is becasue of the case: + Note that the regex of the route expression is using . to map any characters except new line is because of the case: http://jcenter.bintray.com/'com/virtualightning'/, this is in the test too. """ @@ -292,7 +292,7 @@ def get_uris(self, content): # FIXME this may not be correct. 
The only thing we can infer from the maven # metadata is wha are the groupid/artifactid and available versions # The actual download files likely need to be obtained from directory listing - # or infered from parsing the POM??? + # or inferred from parsing the POM??? base_url = self.uri.partition("maven-metadata.xml")[0] + "{version}/" pom_url = base_url + "{artifactId}-{version}.pom" @@ -447,7 +447,7 @@ def is_source(classifier): ######################################################################## -# DOCUMENTAION OF the FIELDS aka. Records: +# DOCUMENTATION OF the FIELDS aka. Records: # # Constants and information for field names can be found in # https://github.com/apache/maven-indexer/tree/ecddb3c18ee1ee1357a01bffa7f9cb5252f21209 diff --git a/minecode/miners/nuget.py b/minecode/miners/nuget.py index 1140f98f..55b4cb86 100644 --- a/minecode/miners/nuget.py +++ b/minecode/miners/nuget.py @@ -287,7 +287,7 @@ def build_packages_from_html(metadata, uri, purl=None): h2s = soup.find_all("h2") for h2 in h2s: # Copyright will be after the copyright h2 node - # The exmaple is like this: + # The example is like this: #

Copyright

#

Copyright 2004-2017 The Apache Software Foundation

if h2.string and h2.string == "Copyright": diff --git a/minecode/miners/pypi.py b/minecode/miners/pypi.py index 87e9eb2c..8c251e77 100644 --- a/minecode/miners/pypi.py +++ b/minecode/miners/pypi.py @@ -222,7 +222,7 @@ def build_packages(metadata, purl=None): # FIXME: we should either support "extra" data in a ScannedPackage or just ignore this kind of FIXME comments for now - # FIXME: not supported in ScanCode Package: info.platform may provide some platform infor (possibly UNKNOWN) + # FIXME: not supported in ScanCode Package: info.platform may provide some platform info (possibly UNKNOWN) # FIXME: not supported in ScanCode Package: info.docs_url # FIXME: not supported in ScanCode Package: info.release_url "http://pypi.python.org/pypi/Django/1.10b1" # FIXME: not supported in ScanCode Package: info.classifiers: this contains a lot of other info (platform, license, etc) @@ -232,7 +232,7 @@ def build_packages(metadata, purl=None): # FIXME: Package Index Owner: seems to be only available on the web page # A download_url may be provided for off Pypi download: we yield a package if relevant - # FIXME: do not prioritize the download_url outside Pypi over actual exact Pypi donwload URL + # FIXME: do not prioritize the download_url outside Pypi over actual exact Pypi download URL download_url = info.get("download_url") if download_url and download_url != "UNKNOWN": download_data = dict( diff --git a/minecode/miners/rubygems.py b/minecode/miners/rubygems.py index 875de57a..084514d8 100644 --- a/minecode/miners/rubygems.py +++ b/minecode/miners/rubygems.py @@ -342,7 +342,7 @@ def get_dependencies_from_meta(content): continue requirement = dependency.get("requirement") or {} - # FIXME when upating to the ScanCode package model + # FIXME when updating to the ScanCode package model scope = dependency.get("type") scope = scope and scope.lstrip(":") diff --git a/minecode/models.py b/minecode/models.py index a243b21b..1739b71a 100644 --- a/minecode/models.py +++ 
b/minecode/models.py @@ -163,7 +163,7 @@ def __str__(self): def normalize_fields(self, exclude=None): """ - Why do we normalize? In some weird cases wee may receive damaged + Why do we normalize? In some weird cases we may receive damaged data (e.g. a very long SHA1) and rather than push down the validation or fail an insert we can normalize the data in a single place. @@ -313,7 +313,7 @@ def get_next_visitable(self): def never_mapped(self): """ Limit the QuerySet to ResourceURIs that have never been mapped. - This is usually the state of a ResourceURI after its succesful visit. + This is usually the state of a ResourceURI after its successful visit. """ return self.successfully_visited().filter(last_map_date__isnull=True, wip_date__isnull=True) @@ -370,7 +370,7 @@ class ResourceURI(BaseURI): # This is a text blob that contains either HTML, JSON or anything # stored as a string. This is the raw content of visiting a URI. - # NOTE: some visited URLS (such as an actual package archive will/shoud NOT be stored there) + # NOTE: some visited URLS (such as an actual package archive will/should NOT be stored there) data = models.TextField( null=True, blank=True, @@ -406,7 +406,7 @@ class ResourceURI(BaseURI): db_index=True, default=False, help_text="When set to True (Yes), this field indicates that " - "an error has occured when visiting this URI.", + "an error has occurred when visiting this URI.", ) visit_error = models.TextField( @@ -434,7 +434,7 @@ class ResourceURI(BaseURI): db_index=True, default=False, help_text="When set to True (Yes), this field indicates that " - "an error has occured when mapping this URI.", + "an error has occurred when mapping this URI.", ) map_error = models.TextField( @@ -896,7 +896,7 @@ class PriorityResourceURI(BaseURI): # This is a text blob that contains either HTML, JSON or anything # stored as a string. This is the raw content of visiting a URI. 
- # NOTE: some visited URLS (such as an actual package archive will/shoud NOT be stored there) + # NOTE: some visited URLS (such as an actual package archive will/should NOT be stored there) data = models.TextField( null=True, blank=True, @@ -931,7 +931,7 @@ class PriorityResourceURI(BaseURI): db_index=True, default=False, help_text="When set to True (Yes), this field indicates that " - "an error has occured when processing this URI.", + "an error has occurred when processing this URI.", ) processing_error = models.TextField( @@ -1025,7 +1025,7 @@ class ImportableURI(BaseURI): # This is a text blob that contains either HTML, JSON or anything # stored as a string. This is the raw content of visiting a URI. - # NOTE: some visited URLS (such as an actual package archive will/shoud NOT be stored there) + # NOTE: some visited URLS (such as an actual package archive will/should NOT be stored there) data = models.TextField( null=True, blank=True, @@ -1052,7 +1052,7 @@ class ImportableURI(BaseURI): db_index=True, default=False, help_text="When set to True (Yes), this field indicates that " - "an error has occured when processing this URI.", + "an error has occurred when processing this URI.", ) processing_error = models.TextField( @@ -1077,14 +1077,14 @@ class ProcessingError(BaseURI): max_length=100, null=True, blank=True, - help_text="The name of the service running where the error occured.", + help_text="The name of the service running where the error occurred.", ) date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text="Timestamp set to the date of when this error occured.", + help_text="Timestamp set to the date of when this error occurred.", ) error_message = models.TextField( diff --git a/minecode/route.py b/minecode/route.py index 0b0445c7..e3fb43ef 100644 --- a/minecode/route.py +++ b/minecode/route.py @@ -18,10 +18,10 @@ There are several routing implementations available in Rails, Django, Flask, Paste, etc. 
However, these all assume that the routed processing is to craft a -response to an incomming external HTTP request. +response to an incoming external HTTP request. Here we are instead doing the opposite: given a URI (and no request yet) we are -routing the processing to emitt a request extrenally (HTTP or other protocol) +routing the processing to emit a request externally (HTTP or other protocol) and handling its response. Also we crawl a lot and not only HTTP: git, svn, ftp, rsync and more. diff --git a/minecode/seed.py b/minecode/seed.py index abba367a..5128b564 100644 --- a/minecode/seed.py +++ b/minecode/seed.py @@ -21,8 +21,8 @@ class Seeder: """ Abstract base class for seeding URIs to visit. Each visitor should create a - subclass of Seeder and implement the get_seeds method to yield the top levle - URIs required to bootstrap the visiting process. The framework decides waht + subclass of Seeder and implement the get_seeds method to yield the top level + URIs required to bootstrap the visiting process. The framework decides what to do with these seeds, but will typically ensure they exist as ResourceURIs in the DB. To be used, seeder classes must be added to the list of active Seeders in the settings module. diff --git a/minecode/utils_test.py b/minecode/utils_test.py index fa64f59c..65ce33a0 100644 --- a/minecode/utils_test.py +++ b/minecode/utils_test.py @@ -304,7 +304,7 @@ def check_expected_uris(self, uris, expected_loc, data_is_json=False, regen=FIXT if uri_dict.get("date"): # Parse date since date will be used as Date field in # ResourceURI object, to make it as string format is just for - # test comparation. + # test comparison. # FIXME: we should ONLY have strings there!!! 
uri_dict["date"] = str(uri_dict.get("date")) results.append(uri_dict) diff --git a/minecode/version.py b/minecode/version.py index 5d25ea9a..3952c1b9 100644 --- a/minecode/version.py +++ b/minecode/version.py @@ -192,5 +192,5 @@ def strip_pre_releases(version_string): for tag in PRE_RELEASE_TAGS: if tag not in version_string: continue - splitted = version_string.split(tag) - return splitted[0] + split_parts = version_string.split(tag) + return split_parts[0] diff --git a/minecode_pipelines/miners/cpan.py b/minecode_pipelines/miners/cpan.py index 98334222..c006bf89 100644 --- a/minecode_pipelines/miners/cpan.py +++ b/minecode_pipelines/miners/cpan.py @@ -27,7 +27,7 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None): """ Get cpan package names parsed from the `02packages.details.txt` - which conatins a list of all modules and their respective + which contains a list of all modules and their respective package archive paths. We parse the package names and their respective path_prefixes with author page path from this list. """ diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py index 8d3c4df8..512474cf 100644 --- a/minecode_pipelines/miners/npm.py +++ b/minecode_pipelines/miners/npm.py @@ -28,7 +28,7 @@ in paginated queries. https://replicate.npmjs.com/_changes -This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequneces which +This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which can be fetched in paginated queries. 
https://registry.npmjs.org/{namespace/name} diff --git a/minecode_pipelines/pipelines/__init__.py b/minecode_pipelines/pipelines/__init__.py index 844f7f1b..0e8ffdcf 100644 --- a/minecode_pipelines/pipelines/__init__.py +++ b/minecode_pipelines/pipelines/__init__.py @@ -38,7 +38,7 @@ class MineCodeBasePipeline(Pipeline): download_inputs = False - # Control wether to ovewrite or append mined purls to existing `purls.yml` file + # Control whether to overwrite or append mined purls to existing `purls.yml` file append_purls = False checked_out_repos = {} @@ -68,7 +68,7 @@ def packages_count(self) -> int: Return the estimated number of packages for which PackageURLs are to be mined. Used by ``mine_and_publish_packageurls`` to log the progress of PackageURL mining. - Note: If estimating package count is not feasable return `None` + Note: If estimating package count is not feasible return `None` """ raise NotImplementedError diff --git a/minecode_pipelines/pipes/maven.py b/minecode_pipelines/pipes/maven.py index 4b58dfa7..dd536194 100644 --- a/minecode_pipelines/pipes/maven.py +++ b/minecode_pipelines/pipes/maven.py @@ -102,7 +102,7 @@ def is_source(classifier): ######################################################################## -# DOCUMENTAION OF the FIELDS aka. Records: +# DOCUMENTATION OF the FIELDS aka. Records: # # Constants and information for field names can be found in # https://github.com/apache/maven-indexer/tree/ecddb3c18ee1ee1357a01bffa7f9cb5252f21209 @@ -683,7 +683,7 @@ def _get_packages(self, content=None): # build a URL: This is the real JAR download URL # FIXME: this should be set at the time of creating Artifacts - # instead togther with the filename... especially we could use + # instead together with the filename... especially we could use # different REPOs. 
jar_download_url, _ = build_url_and_filename( group_id, artifact_id, version, extension, classifier diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py index a09a8e6a..078b19af 100644 --- a/minecode_pipelines/pipes/npm.py +++ b/minecode_pipelines/pipes/npm.py @@ -74,7 +74,7 @@ def mine_npm_packages(logger=None): 1. first sync: we get latest set of packages from the "_all_docs" API endpoint of npm replicate and save this and last sequence of the package to checkpoints. - 2. intial sync: we get packages from checkpoint which we're trying to sync upto + 2. initial sync: we get packages from checkpoint which we're trying to sync up to 3. periodic sync: we get latest packages newly released in npm through the "_changes" API, for a period, from our last mined sequence of package. """ @@ -286,7 +286,7 @@ def save_mined_packages_in_checkpoint(packages_mined, synced_packages, config_re def update_state_and_checkpoints(state, last_seq, config_repo, logger=None): - # If we are finished mining all the packages in the intial sync, we can now + # If we are finished mining all the packages in the initial sync, we can now # periodically sync the packages from latest if state == INITIAL_SYNC_STATE: if logger: diff --git a/minecode_pipelines/pipes/pypi.py b/minecode_pipelines/pipes/pypi.py index beb5d08b..7e3eec4a 100644 --- a/minecode_pipelines/pipes/pypi.py +++ b/minecode_pipelines/pipes/pypi.py @@ -67,7 +67,7 @@ def mine_pypi_packages(logger=None): Mine pypi package names from pypi simple and save to checkpoints, or get packages from saved checkpoints. We have 3 cases: 1. periodic sync: we get latest packages newly released in pypi, for a period - 2. intial sync: we get packages from checkpoint which we're trying to sync upto + 2. initial sync: we get packages from checkpoint which we're trying to sync up to 3. 
first sync: we get latest packages from pypi and save to checkpoints """ @@ -257,7 +257,7 @@ def save_mined_packages_in_checkpoint(packages_mined, config_repo, logger=None): def update_state_and_checkpoints(state, config_repo, last_serial, logger=None): - # If we are finshed mining all the packages in the intial sync, we can now + # If we are finished mining all the packages in the initial sync, we can now # periodically sync the packages from latest if state == INITIAL_SYNC_STATE: if logger: diff --git a/packagedb/api.py b/packagedb/api.py index 73fc4d91..de15ab2f 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -204,7 +204,7 @@ class ResourceViewSet(viewsets.ReadOnlyModelViewSet): @action(detail=False, methods=["post"]) def filter_by_checksums(self, request, *args, **kwargs): """ - Take a mapping, where the keys are the names of the checksum algorthm + Take a mapping, where the keys are the names of the checksum algorithm and the values is a list of checksum values and query those values against the packagedb. @@ -417,7 +417,7 @@ def get_enhanced_package_data(self, request, *args, **kwargs): @action(detail=False, methods=["post"]) def filter_by_checksums(self, request, *args, **kwargs): """ - Take a mapping, where the keys are the names of the checksum algorthm + Take a mapping, where the keys are the names of the checksum algorithm and the values is a list of checksum values and query those values against the packagedb. @@ -749,7 +749,7 @@ class CollectViewSet(viewsets.ViewSet): Optionally, use a list of `addon_pipelines` to use for the scan. See add-on pipelines at https://scancodeio.readthedocs.io/en/latest/built-in-pipelines.html - Paremeters: + Parameters: - `purl`: (required, string) a PURL, with a version.