Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions minecode_pipelines/pipelines/mine_crates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipes import federatedcode

from minecode_pipelines import pipes
from minecode_pipelines.pipelines import MineCodeBasePipeline
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
from minecode_pipelines.pipes import crates


class MineCrates(MineCodeBasePipeline):
    """Mine PackageURLs from crates.io-index and publish them to FederatedCode."""

    # Repository holding the pipeline checkpoints, and the checkpoint file
    # path inside that repository.
    pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
    checkpoint_path = "crates/checkpoints.json"
    append_purls = True

    crates_index_repo_url = "https://github.com/rust-lang/crates.io-index"

    # ISO 8601 timestamps: the one saved by the previous run (empty when there
    # is none) and the one captured for this run.
    last_checkpoint = ""
    current_utc = ""

    @classmethod
    def steps(cls):
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.fetch_federation_config,
            # Capture the run timestamp BEFORE cloning the index: a crate
            # published after the clone is not part of this run, so its
            # versions must stay newer than the checkpoint saved at the end,
            # or the next run would filter them out as already processed.
            cls.get_current_utc,
            cls.fetch_checkpoint_and_crates_io_index,
            cls.mine_and_publish_crates_packageurls,
            cls.save_check_point,
            cls.delete_working_dir,
        )

    def fetch_checkpoint_and_crates_io_index(self):
        """
        Clone the checkpoint config repository and the crates.io-index
        repository, load the last checkpoint and create the crates collector.
        """
        self.checkpoint_config_repo = federatedcode.clone_repository(
            repo_url=self.pipeline_config_repo,
            clone_path=self.working_path / "minecode-pipelines-config",
            logger=self.log,
        )
        checkpoint = pipes.get_checkpoint_from_file(
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
        )
        if checkpoint:
            # Keep the empty-string default when the key is missing or null.
            self.last_checkpoint = checkpoint.get("previous_index_date") or ""
            self.log(f"last_checkpoint: {self.last_checkpoint}")

        # Clone the crates.io-index repository
        self.crates_index_repo = federatedcode.clone_repository(
            repo_url=self.crates_index_repo_url,
            clone_path=self.working_path / "crates_index_repo",
            logger=self.log,
        )

        self.crates_collector = crates.CratesCollector(
            repo_location=self.crates_index_repo,
            logger=self.log,
        )

    def get_current_utc(self):
        """Store the current UTC time, in ISO 8601 format, as ``current_utc``."""
        from datetime import datetime, timezone

        self.current_utc = datetime.now(timezone.utc).isoformat()

    def mine_and_publish_crates_packageurls(self):
        """
        Mine the crates PackageURLs updated since the last checkpoint and
        publish them.
        """
        _mine_and_publish_packageurls(
            packageurls=self.crates_collector.get_packages(
                previous_index_date=self.last_checkpoint
            ),
            # The collector yields lazily, so no total is known up front.
            total_package_count=None,
            data_cluster=self.data_cluster,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
        )

    def save_check_point(self):
        """Save the timestamp captured for this run as the new checkpoint."""
        checkpoint = {"previous_index_date": self.current_utc}

        self.log(f"Saving checkpoint: {checkpoint}")
        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
            logger=self.log,
        )
11 changes: 8 additions & 3 deletions minecode_pipelines/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,14 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):

def get_checkpoint_from_file(cloned_repo, path):
    """
    Return the checkpoint data loaded from the JSON file at ``path``, relative
    to the working directory of ``cloned_repo``.

    Return an empty dict when the file does not exist, does not contain valid
    JSON, or holds an empty/null payload, so that a first run without any
    saved checkpoint does not fail.
    """
    checkpoint_path = os.path.join(cloned_repo.working_dir, path)
    try:
        with open(checkpoint_path) as f:
            checkpoint_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    return checkpoint_data or {}


def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None):
Expand Down
89 changes: 89 additions & 0 deletions minecode_pipelines/pipes/crates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import os
import requests
from dateutil import parser
from packagedcode.models import PackageData
from packageurl import PackageURL


# Debug tracing switches; both are disabled by default.
TRACE = False
TRACE_DEEP = False

# Base URL of the crates.io API endpoint for crates. Note the trailing slash.
CRATES_API_URL = "https://crates.io/api/v1/crates/"


class CratesCollector:
    """
    Collect crate versions for the crates listed in a crates.io-index checkout.

    Crate names are taken from the file names found in the checkout and the
    details of each crate's versions are fetched from the crates.io API.
    """

    # Seconds to wait for the crates.io API before giving up on a crate.
    request_timeout = 30

    # Top-level entries of the index checkout that are not crate files.
    skipped_top_level_dirs = (".github", ".git")
    skipped_top_level_files = ("README.md", "config.json")

    def __init__(
        self,
        repo_location=None,
        logger=None,
    ):
        """
        Create a collector for the checkout at ``repo_location``, an object
        exposing a ``working_dir`` attribute. ``logger`` is an optional
        callable accepting a message string.

        Raise an Exception if ``repo_location`` is not provided.
        """
        if not repo_location:
            raise Exception("repo_location must be set for CratesCollector.")
        self.repo_location = repo_location
        self.logger = logger

    def _log(self, message, logger=None):
        """Send ``message`` to ``logger`` or the instance logger, if any."""
        log = logger or self.logger
        if log:
            log(message)

    def get_packages(self, previous_index_date=None, logger=None):
        """
        Yield ``(PackageURL, [purl])`` tuples, one for each version of each
        crate found in the crates.io-index checkout, where the list holds the
        ``purl`` attribute of the PackageData built for that version.

        ``previous_index_date`` is an optional ISO 8601 timestamp: versions
        with an ``updated_at`` date older than it are skipped. ``logger`` is
        an optional callable used instead of the instance logger.

        Crates that cannot be fetched from the API are logged and skipped.
        """
        base_dir = self.repo_location.working_dir

        previous_index_date_parsed = None
        if previous_index_date:
            previous_index_date_parsed = parser.isoparse(previous_index_date)

        for root, dirs, filenames in os.walk(base_dir):
            if root == base_dir:
                # Prune in place so os.walk does not descend into these
                # directories; this also works when one of them is absent.
                dirs[:] = [d for d in dirs if d not in self.skipped_top_level_dirs]
                filenames = [f for f in filenames if f not in self.skipped_top_level_files]

            for crate_name in filenames:
                # Strip the trailing slash of the base URL to avoid a "//".
                url = CRATES_API_URL.rstrip("/") + "/" + crate_name
                try:
                    response = requests.get(url, timeout=self.request_timeout)
                except requests.RequestException as e:
                    self._log(f"Error fetching {crate_name}: {e}", logger)
                    continue

                if response.status_code != 200:
                    self._log(f"Error fetching {crate_name}: {response.status_code}", logger)
                    continue

                data = response.json()
                for crate_version_info in data.get("versions") or []:
                    package_last_update = crate_version_info.get("updated_at", "")
                    if previous_index_date_parsed and package_last_update:
                        last_update = parser.isoparse(package_last_update)
                        if last_update < previous_index_date_parsed:
                            # Not updated since the previous run.
                            continue

                    name = crate_version_info.get("crate")
                    version = crate_version_info.get("num")
                    download_url = "https://crates.io" + crate_version_info.get("dl_path", "")
                    release_date = crate_version_info.get("created_at", "")
                    sha256 = crate_version_info.get("checksum", "")
                    homepage_url = crate_version_info.get("homepage", "")
                    if not homepage_url:
                        homepage_url = crate_version_info.get("repository", "")

                    # "cargo" is the Package URL type for Rust crates.
                    package = PackageData(
                        type="cargo",
                        namespace=None,
                        name=name,
                        version=version,
                        qualifiers=None,
                        download_url=download_url,
                        sha256=sha256,
                        release_date=release_date,
                        repository_homepage_url=homepage_url,
                        repository_download_url=download_url,
                    )
                    current_purl = PackageURL(
                        type="cargo",
                        name=name,
                        version=version,
                    )
                    yield current_purl, [package.purl]
1 change: 1 addition & 0 deletions pyproject-minecode_pipelines.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan"
mine_cran = "minecode_pipelines.pipelines.mine_cran:MineCran"
mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift"
mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer"
mine_crates = "minecode_pipelines.pipelines.mine_crates:MineCrates"

[tool.bumpversion]
current_version = "0.1.1"
Expand Down