diff --git a/minecode_pipelines/pipelines/mine_crates.py b/minecode_pipelines/pipelines/mine_crates.py new file mode 100644 index 00000000..16a1e6bc --- /dev/null +++ b/minecode_pipelines/pipelines/mine_crates.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
# --- minecode_pipelines/pipelines/mine_crates.py ---


class MineCrates(MineCodeBasePipeline):
    """Mine PackageURLs from crates.io-index and publish them to FederatedCode."""

    # Repository holding the checkpoint file, and the path of that file in it.
    pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
    checkpoint_path = "crates/checkpoints.json"
    append_purls = True

    crates_index_repo_url = "https://github.com/rust-lang/crates.io-index"

    # ISO 8601 timestamp read from the checkpoint file; empty when there is
    # no usable checkpoint, in which case nothing is filtered out.
    last_checkpoint = ""
    # ISO 8601 timestamp of this run, saved as the next checkpoint.
    current_utc = ""

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.fetch_federation_config,
            # The timestamp must be taken BEFORE the index is cloned. A crate
            # first published between the clone and a later timestamp would
            # be missing from this run's checkout and then be filtered out of
            # the next run because it is older than the saved checkpoint.
            cls.get_current_utc,
            cls.fetch_checkpoint_and_crates_io_index,
            cls.mine_and_publish_crates_packageurls,
            cls.save_check_point,
            cls.delete_working_dir,
        )

    def get_current_utc(self):
        """Record the current UTC time as a timezone-aware ISO 8601 string."""
        from datetime import datetime, timezone

        self.current_utc = datetime.now(timezone.utc).isoformat()

    def fetch_checkpoint_and_crates_io_index(self):
        """
        Clone the config repository and the crates.io-index repository, load
        the last checkpoint and create the crates collector.
        """
        self.checkpoint_config_repo = federatedcode.clone_repository(
            repo_url=self.pipeline_config_repo,
            clone_path=self.working_path / "minecode-pipelines-config",
            logger=self.log,
        )
        checkpoint = pipes.get_checkpoint_from_file(
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
        )
        if checkpoint:
            # Keep the empty-string default when the key is absent or null.
            self.last_checkpoint = checkpoint.get("previous_index_date") or ""
            self.log(f"last_checkpoint: {self.last_checkpoint}")

        # Clone the crates.io-index repository
        self.crates_index_repo = federatedcode.clone_repository(
            repo_url=self.crates_index_repo_url,
            clone_path=self.working_path / "crates_index_repo",
            logger=self.log,
        )

        self.crates_collector = crates.CratesCollector(
            repo_location=self.crates_index_repo,
            logger=self.log,
        )

    def mine_and_publish_crates_packageurls(self):
        """
        Pass the PackageURLs collected since the last checkpoint to
        ``_mine_and_publish_packageurls``.
        """
        _mine_and_publish_packageurls(
            packageurls=self.crates_collector.get_packages(
                previous_index_date=self.last_checkpoint
            ),
            total_package_count=None,
            data_cluster=self.data_cluster,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
        )

    def save_check_point(self):
        """Save the start time of this run as the next ``previous_index_date``."""
        checkpoint = {"previous_index_date": self.current_utc}

        self.log(f"Saving checkpoint: {checkpoint}")
        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
            logger=self.log,
        )


# --- minecode_pipelines/pipes/__init__.py ---


def get_checkpoint_from_file(cloned_repo, path):
    """
    Return the checkpoint data stored as JSON at ``path`` inside the working
    directory of ``cloned_repo``.

    Return an empty dict when the file is missing, is not valid JSON, or
    holds an empty/null value, so that callers always get a mapping back.
    """
    checkpoint_path = os.path.join(cloned_repo.working_dir, path)
    try:
        with open(checkpoint_path) as f:
            checkpoint_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Best effort: a missing or unreadable checkpoint means "no checkpoint".
        return {}
    # A file containing ``null`` must not leak ``None`` to callers.
    return checkpoint_data or {}
TRACE = False
TRACE_DEEP = False

CRATES_API_URL = "https://crates.io/api/v1/crates/"

# Seconds to wait for crates.io before giving up on a single request.
_REQUEST_TIMEOUT = 30

# Top-level entries of the crates.io-index checkout that are not crate files.
_IGNORED_TOP_LEVEL_DIRS = frozenset((".git", ".github"))
_IGNORED_TOP_LEVEL_FILES = frozenset(("README.md", "config.json"))


class CratesCollector:
    """
    Collect cargo packages for the crates listed in a crates.io-index checkout.

    The checkout only provides the crate names (one file per crate); the
    version details are fetched from the crates.io API.
    """

    def __init__(
        self,
        repo_location=None,
        logger=None,
    ):
        """
        ``repo_location`` is an object exposing the checkout directory as
        ``working_dir``. ``logger`` is an optional callable taking a message
        string; messages are discarded when it is not provided.

        Raise a ValueError when ``repo_location`` is not set.
        """
        if not repo_location:
            raise ValueError("repo_location must be set for CratesCollector.")
        self.repo_location = repo_location
        # Always keep a callable so that error reporting can never fail.
        self.logger = logger or self._discard_message

    @staticmethod
    def _discard_message(message):
        """Do nothing: fallback logger used when no logger is provided."""
        return None

    @staticmethod
    def _crate_api_url(crate_name):
        """Return the crates.io API URL for ``crate_name``."""
        # CRATES_API_URL may or may not end with "/": avoid a double slash.
        return f"{CRATES_API_URL.rstrip('/')}/{crate_name}"

    def _iter_crate_names(self):
        """Yield the name of every crate file found in the index checkout."""
        # os.walk() yields strings, so compare against a string even when
        # working_dir is a path-like object.
        base_dir = os.fspath(self.repo_location.working_dir)
        for root, dirs, filenames in os.walk(base_dir):
            if root == base_dir:
                # Prune in place so os.walk() does not descend into these;
                # unlike list.remove() this tolerates missing entries.
                dirs[:] = [d for d in dirs if d not in _IGNORED_TOP_LEVEL_DIRS]
                filenames = [f for f in filenames if f not in _IGNORED_TOP_LEVEL_FILES]
            yield from filenames

    def _fetch_crate_data(self, crate_name, log):
        """
        Return the decoded crates.io API response for ``crate_name`` or None
        when the request fails. Failures are reported through ``log``.
        """
        # NOTE(review): crates.io asks API clients to identify themselves
        # with a User-Agent header and to limit their request rate; confirm
        # whether that should be handled here.
        url = self._crate_api_url(crate_name)
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            log(f"Error fetching {crate_name}: {exc}")
            return None

        if response.status_code != 200:
            log(f"Error fetching {crate_name}: {response.status_code}")
            return None

        try:
            return response.json()
        except ValueError as exc:
            log(f"Error decoding response for {crate_name}: {exc}")
            return None

    def get_packages(self, previous_index_date=None, logger=None):
        """
        Yield ``(PackageURL, [purl])`` tuples, one for each version of each
        crate of the index checkout.

        ``previous_index_date`` is an optional ISO 8601 timestamp: versions
        whose ``updated_at`` is older than it are skipped. ``logger`` is an
        optional callable overriding the logger given to the constructor.
        """
        log = logger or self.logger

        cutoff = None
        if previous_index_date:
            cutoff = parser.isoparse(previous_index_date)

        for crate_name in self._iter_crate_names():
            data = self._fetch_crate_data(crate_name, log)
            if not data:
                continue

            crate_info = data.get("crate") or {}
            for version_info in data.get("versions") or []:
                updated_at = version_info.get("updated_at", "")
                if cutoff and updated_at and parser.isoparse(updated_at) < cutoff:
                    continue

                name = version_info.get("crate")
                version = version_info.get("num")
                if not name:
                    # A purl cannot be built without a name.
                    log(f"Skipping version entry without a crate name in {crate_name}")
                    continue

                dl_path = version_info.get("dl_path")
                download_url = f"https://crates.io{dl_path}" if dl_path else None

                # Use the version-level URLs when present and fall back to
                # the crate-level metadata otherwise.
                homepage_url = (
                    version_info.get("homepage")
                    or version_info.get("repository")
                    or crate_info.get("homepage")
                    or crate_info.get("repository")
                    or None
                )

                package = PackageData(
                    type="cargo",
                    namespace=None,
                    name=name,
                    version=version,
                    qualifiers=None,
                    download_url=download_url,
                    sha256=version_info.get("checksum") or None,
                    release_date=version_info.get("created_at") or None,
                    repository_homepage_url=homepage_url,
                    repository_download_url=download_url,
                )
                # NOTE(review): the first item carries the version; confirm
                # the consumer does not expect a version-less base purl here.
                current_purl = PackageURL(
                    type="cargo",
                    name=name,
                    version=version,
                )
                yield current_purl, [package.purl]