diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py
index 8f0fb4d3..bf6bd360 100644
--- a/minecode/collectors/maven.py
+++ b/minecode/collectors/maven.py
@@ -476,9 +476,6 @@ def process_request(purl_str, **kwargs):
collect_links = re.compile(r'href="([^"]+)"').findall
-collect_links_and_artifact_timestamps = re.compile(
- r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
-).findall
def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
@@ -675,6 +672,62 @@ def filter_for_artifacts(timestamps_by_links):
return timestamps_by_links_filtered
+def collect_links_and_artifact_timestamps(text):
+ # Return a list of (link, timestamp) tuples extracted from the given HTML
+ # text, trying each known repository listing format in turn.
+
+ # Pattern that matches with https://repo.maven.apache.org/maven2
+ maven_apache_pattern = re.compile(
+ r']*>[^<]*\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
+ )
+ maven_apache_matches = maven_apache_pattern.findall(text)
+ if maven_apache_matches:
+ return maven_apache_matches
+
+ # Pattern that matches with
+ # both Apache (UTC) and Nexus (Z) formats
+ # https://repository.jboss.org/nexus/service/rest/repository/browse/releases/
+ # https://repository.jboss.org/nexus/service/rest/repository/browse/public/
+ # https://repository.apache.org/snapshots/
+ repo_jboss_apache_pattern = re.compile(
+ r']*>[^<]*\s*
\s*((?:[A-Z][a-z]{2}\s+[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+(?:UTC|Z)\s+\d{4})| )\s* | '
+ )
+ repo_jboss_apache_matches = repo_jboss_apache_pattern.findall(text)
+ # Convert to empty string for table format
+ if repo_jboss_apache_matches:
+ return [
+ (item, "" if timestamp == " " else timestamp)
+ for item, timestamp in repo_jboss_apache_matches
+ ]
+
+ # Pattern that matches with
+ # https://repo.spring.io/milestone
+ repo_spring_pattern = re.compile(
+ r']*>[^<]*\s+(\d{2}-[A-Z][a-z]{2}-\d{4}\s+\d{2}:\d{2})'
+ )
+ repo_spring_matches = repo_spring_pattern.findall(text)
+ if repo_spring_matches:
+ return repo_spring_matches
+
+ # Simple links in tags without timestamps (Gradle plugins format)
+ # https://plugins.gradle.org/m2/
+ plugins_gradle_pattern = re.compile(r']*>[^<]*
')
+ plugins_gradle_matches = plugins_gradle_pattern.findall(text)
+ if plugins_gradle_matches:
+ # Filter out parent directory link if present
+ filtered_matches = []
+ for href in plugins_gradle_matches:
+ # Skip parent directory links
+ if href != "../" and not href.startswith(".."):
+ filtered_matches.append((href, ""))
+
+ # Only return if we found non-parent links
+ if filtered_matches:
+ return filtered_matches
+
+ return []
+
+
def collect_links_from_text(text, filter):
"""
Return a mapping of link locations and their timestamps, given HTML `text`
@@ -700,7 +753,7 @@ def create_absolute_urls_for_links(text, url, filter):
url = url.rstrip("/")
timestamps_by_links = collect_links_from_text(text, filter)
for link, timestamp in timestamps_by_links.items():
- if not link.startswith(url):
+ if not link.startswith("http:") and not link.startswith("https:"):
link = f"{url}/{link}"
timestamps_by_absolute_links[link] = timestamp
return timestamps_by_absolute_links
@@ -758,23 +811,20 @@ def get_artifact_sha1(artifact_url):
return sha1
-def get_classifier_from_artifact_url(
- artifact_url, package_version_page_url, package_name, package_version
-):
+def get_classifier_from_artifact_url(artifact_url, package_name, package_version):
"""
Return the classifier from a Maven artifact URL `artifact_url`, otherwise
return None if a classifier cannot be determined from `artifact_url`
"""
classifier = None
- # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
- package_version_page_url = package_version_page_url.rstrip("/")
- # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
- leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}"
+ package_name_version_portion = f"{package_name}-{package_version}"
+ artifact_url_filename = artifact_url.rsplit("/", 1)[-1]
+ remaining_url_portion = artifact_url_filename.replace(package_name_version_portion, "")
# artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
- # ['', '-onejar.jar']
- _, remaining_url_portion = artifact_url.split(leading_url_portion)
- # ['-onejar', 'jar']
+ # artifact_url_filename = 'livereload-jvm-0.2.0-onejar.jar'
+ # remaining_url_portion = '-onejar.jar'
remaining_url_portions = remaining_url_portion.split(".")
+ # ['-onejar', 'jar']
if remaining_url_portions and remaining_url_portions[0]:
# '-onejar'
classifier = remaining_url_portions[0]
diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py
index ed754e31..f50895fa 100644
--- a/minecode/management/commands/import_queue.py
+++ b/minecode/management/commands/import_queue.py
@@ -132,12 +132,14 @@ def process_request(importable_uri):
timestamps_by_artifact_links = get_artifact_links(version_page_url)
for artifact_link, timestamp in timestamps_by_artifact_links.items():
sha1 = get_artifact_sha1(artifact_link)
- classifier = get_classifier_from_artifact_url(
- artifact_link, version_page_url, name, version
- )
+ classifier = get_classifier_from_artifact_url(artifact_link, name, version)
qualifiers = None
if classifier:
qualifiers = f"classifier={classifier}"
- release_date = dateutil_parse(timestamp)
+ # Some repository listings provide no timestamp; leave release_date unset then.
+ if timestamp:
+ release_date = dateutil_parse(timestamp)
+ else:
+ release_date = None
package_data = PackageData(
type="maven",
diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py
index df6da9cf..baf1fbc7 100644
--- a/minecode/management/commands/maven_crawler.py
+++ b/minecode/management/commands/maven_crawler.py
@@ -26,5 +26,15 @@ class Command(VerboseCommand):
help = "Run a Package request queue."
def handle(self, *args, **options):
- maven_root_url = "https://repo.maven.apache.org/maven2"
- crawl_maven_repo_from_root(root_url=maven_root_url)
+ # Add the maven root URLs
+ # Ref: https://github.com/aboutcode-org/purldb/issues/630#issuecomment-3599942716
+ maven_root_urls = [
+ "https://repo.maven.apache.org/maven2",
+ "https://repo.spring.io/artifactory/milestone",
+ "https://plugins.gradle.org/m2",
+ "https://repository.apache.org/content/groups/snapshots",
+ "https://repository.jboss.org/nexus/service/rest/repository/browse/releases",
+ "https://repository.jboss.org/nexus/service/rest/repository/browse/public",
+ ]
+ for maven_root_url in maven_root_urls:
+ crawl_maven_repo_from_root(root_url=maven_root_url)
diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py
index 541bf28b..05d31f6c 100644
--- a/minecode/tests/collectors/test_maven.py
+++ b/minecode/tests/collectors/test_maven.py
@@ -201,7 +201,7 @@ def test_get_merged_ancestor_package_from_maven_package(
class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase):
- test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")
+ test_data_dir = os.path.join(os.path.dirname(__file__), "../testfiles")
def test_check_if_file_name_is_linked_on_page(self):
links = ["foo/", "bar/", "baz/"]
@@ -500,12 +500,80 @@ def test_get_artifact_sha1(self, mock_request_get):
def test_get_classifier_from_artifact_url(self):
artifact_url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar"
- package_version_page_url = (
- "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/"
- )
package_name = "livereload-jvm"
package_version = "0.2.0"
classifier = maven.get_classifier_from_artifact_url(
- artifact_url, package_version_page_url, package_name, package_version
+ artifact_url, package_name, package_version
)
self.assertEqual("onejar", classifier)
+
+ def test_collect_links_and_artifact_timestamps_repo_maven_apache_org(self):
+ # https://repo.maven.apache.org/maven2
+ with open(self.get_test_loc("maven/html/maven.apache.org/abbot.html")) as file:
+ text = file.read()
+ expected = [
+ ("1.4.0/", "2015-09-22 16:03"),
+ ("maven-metadata.xml", "2015-09-24 14:18"),
+ ]
+
+ self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text))
+
+ def test_collect_links_and_artifact_timestamps_repository_jboss_org(self):
+ # https://repository.jboss.org/nexus/service/rest/repository/browse/public/
+ # https://repository.jboss.org/nexus/service/rest/repository/browse/releases/
+ with open(self.get_test_loc("maven/html/repository.jboss.org/commons-codec.html")) as file:
+ text = file.read()
+ expected = [
+ ("1.2/", ""),
+ (
+ "https://repository.jboss.org/nexus/repository/public/apache-codec/commons-codec/maven-metadata.xml",
+ "Fri Sep 05 09:38:07 Z 2025",
+ ),
+ ]
+
+ self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text))
+
+ def test_collect_links_and_artifact_timestamps_repository_apache_org(self):
+ # https://repository.apache.org/snapshots/
+ with open(self.get_test_loc("maven/html/repository.apache.org/common-chain.html")) as file:
+ text = file.read()
+ expected = [
+ (
+ "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/1.3-SNAPSHOT/",
+ "Thu Jul 04 05:45:00 UTC 2013",
+ ),
+ (
+ "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/2.0-SNAPSHOT/",
+ "Tue Aug 21 20:26:48 UTC 2018",
+ ),
+ (
+ "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.md5",
+ "Tue Aug 21 20:26:47 UTC 2018",
+ ),
+ (
+ "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.sha1",
+ "Tue Aug 21 20:26:47 UTC 2018",
+ ),
+ ]
+
+ self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text))
+
+ def test_collect_links_and_artifact_timestamps_repo_spring_io(self):
+ # https://repo.spring.io/milestone
+ with open(self.get_test_loc("maven/html/repo.spring.io/scstest.html")) as file:
+ text = file.read()
+ expected = [
+ ("0.0.11.M2/", "07-Aug-2019 08:40"),
+ ("0.0.11.RC2/", "07-Aug-2019 08:36"),
+ ("maven-metadata.xml", "07-Aug-2019 09:07"),
+ ]
+
+ self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text))
+
+ def test_collect_links_and_artifact_timestamps_plugin_gradle_org(self):
+ # https://plugins.gradle.org/m2/
+ with open(self.get_test_loc("maven/html/plugins.gradle.org/test.html")) as file:
+ text = file.read()
+ expected = [("0.0.10/", ""), ("1.0.1/", ""), ("1.1.0/", ""), ("maven-metadata.xml", "")]
+
+ self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text))
diff --git a/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html b/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html
new file mode 100644
index 00000000..7b2ce8fd
--- /dev/null
+++ b/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html
@@ -0,0 +1,29 @@
+
+
+
+
+ Central Repository: abbot/abbot
+
+
+
+
+
+
+
+
+
+../
+1.4.0/ 2015-09-22 16:03 -
+maven-metadata.xml 2015-09-24 14:18 402
+
+
+
+
+
+
diff --git a/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html b/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html
new file mode 100644
index 00000000..b4df3484
--- /dev/null
+++ b/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html
@@ -0,0 +1,10 @@
+
+
+
+
+0.0.10/
+1.0.1/
+1.1.0/
+maven-metadata.xml
+
+
diff --git a/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html b/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html
new file mode 100644
index 00000000..b0c8c1f4
--- /dev/null
+++ b/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html
@@ -0,0 +1,14 @@
+
+
+
+Index of milestone/com/albertoimpl/test/scstest/releasetest
+
+
+Index of milestone/com/albertoimpl/test/scstest/releasetest
+Name Last modified Size
+../
+0.0.11.M2/ 07-Aug-2019 08:40 -
+0.0.11.RC2/ 07-Aug-2019 08:36 -
+maven-metadata.xml 07-Aug-2019 09:07 449 bytes
+
+
Artifactory Online Server
diff --git a/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html b/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html
new file mode 100644
index 00000000..29937019
--- /dev/null
+++ b/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html
@@ -0,0 +1,59 @@
+
+
+ Index of /groups/snapshots/commons-chain/commons-chain
+
+
+
+
+
+
+
+
+ Index of /groups/snapshots/commons-chain/commons-chain
+
+
+
diff --git a/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html b/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html
new file mode 100644
index 00000000..483c331c
--- /dev/null
+++ b/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html
@@ -0,0 +1,50 @@
+
+
+
+ Index of /apache-codec/commons-codec
+
+
+
+
+
+
+
+
+
+
+Index of /apache-codec/commons-codec
+
+
+
+
+
diff --git a/minecode_pipelines/pipelines/mine_maven.py b/minecode_pipelines/pipelines/mine_maven.py
index e11c7b6c..20ddcdf7 100644
--- a/minecode_pipelines/pipelines/mine_maven.py
+++ b/minecode_pipelines/pipelines/mine_maven.py
@@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+from aboutcode.pipeline import optional_step
from scanpipe.pipes import federatedcode
from minecode_pipelines import pipes
@@ -32,7 +33,7 @@ class MineMaven(MineCodeBasePipeline):
"""Mine PackageURLs from maven index and publish them to FederatedCode."""
pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
- checkpoint_path = "maven/checkpoints.json"
+
append_purls = True
@classmethod
@@ -41,31 +42,226 @@ def steps(cls):
cls.check_federatedcode_eligibility,
cls.create_federatedcode_working_dir,
cls.fetch_federation_config,
- cls.fetch_checkpoint_and_maven_index,
- cls.mine_and_publish_maven_packageurls,
- cls.save_check_point,
+ cls.fetch_checkpoint_config_repo,
+ cls.fetch_maven_index_repo1_maven_org,
+ cls.mine_and_publish_maven_packageurls_repo1_maven_org,
+ cls.save_check_point_repo1_maven_org,
+ cls.fetch_maven_index_repo_spring_io_release,
+ cls.mine_and_publish_maven_packageurls_repo_spring_io_release,
+ cls.save_check_point_repo_spring_io_release,
+ cls.fetch_maven_index_repo_spring_io_milestone,
+ cls.mine_and_publish_maven_packageurls_repo_spring_io_milestone,
+ cls.save_check_point_repo_spring_io_milestone,
+ cls.fetch_maven_index_plugins_gradle_org,
+ cls.mine_and_publish_maven_packageurls_plugins_gradle_org,
+ cls.save_check_point_plugins_gradle_org,
+ cls.fetch_maven_index_repository_apache_org,
+ cls.mine_and_publish_maven_packageurls_repository_apache_org,
+ cls.save_check_point_repository_apache_org,
cls.delete_working_dir,
)
- def fetch_checkpoint_and_maven_index(self):
+ def fetch_checkpoint_config_repo(self):
self.checkpoint_config_repo = federatedcode.clone_repository(
repo_url=self.pipeline_config_repo,
clone_path=self.working_path / "minecode-pipelines-config",
logger=self.log,
)
+
+ @optional_step("repo1.maven.org")
+ def fetch_maven_index_repo1_maven_org(self):
+ checkpoint_path = "maven/repo.maven.org/checkpoints.json"
+ maven_url = "https://repo1.maven.org/maven2"
checkpoint = pipes.get_checkpoint_from_file(
cloned_repo=self.checkpoint_config_repo,
- path=self.checkpoint_path,
+ path=checkpoint_path,
+ )
+ last_incremental = checkpoint.get("last_incremental")
+ self.log(f"last_incremental: {last_incremental}")
+ self.maven_nexus_collector = maven.MavenNexusCollector(
+ maven_url=maven_url,
+ last_incremental=last_incremental,
+ logger=self.log,
)
+ @optional_step("repo1.maven.org")
+ def mine_and_publish_maven_packageurls_repo1_maven_org(self):
+ _mine_and_publish_packageurls(
+ packageurls=self.maven_nexus_collector.get_packages(),
+ total_package_count=None,
+ data_cluster=self.data_cluster,
+ checked_out_repos=self.checked_out_repos,
+ working_path=self.working_path,
+ append_purls=self.append_purls,
+ commit_msg_func=self.commit_message,
+ logger=self.log,
+ )
+
+ @optional_step("repo1.maven.org")
+ def save_check_point_repo1_maven_org(self):
+ checkpoint_path = "maven/repo.maven.org/checkpoints.json"
+ last_incremental = self.maven_nexus_collector.index_properties.get(
+ "nexus.index.last-incremental"
+ )
+ checkpoint = {"last_incremental": last_incremental}
+ self.log(f"Saving checkpoint: {checkpoint}")
+ pipes.update_checkpoints_in_github(
+ checkpoint=checkpoint,
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/release")
+ def fetch_maven_index_repo_spring_io_release(self):
+ checkpoint_path = "maven/repo.spring.io/release/checkpoints.json"
+ maven_url = "https://repo.spring.io/artifactory/release"
+ checkpoint = pipes.get_checkpoint_from_file(
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ )
+ last_incremental = checkpoint.get("last_incremental")
+ self.log(f"last_incremental: {last_incremental}")
+ self.maven_nexus_collector = maven.MavenNexusCollector(
+ maven_url=maven_url,
+ last_incremental=last_incremental,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/release")
+ def mine_and_publish_maven_packageurls_repo_spring_io_release(self):
+ _mine_and_publish_packageurls(
+ packageurls=self.maven_nexus_collector.get_packages(),
+ total_package_count=None,
+ data_cluster=self.data_cluster,
+ checked_out_repos=self.checked_out_repos,
+ working_path=self.working_path,
+ append_purls=self.append_purls,
+ commit_msg_func=self.commit_message,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/release")
+ def save_check_point_repo_spring_io_release(self):
+ checkpoint_path = "maven/repo.spring.io/release/checkpoints.json"
+ last_incremental = self.maven_nexus_collector.index_properties.get(
+ "nexus.index.last-incremental"
+ )
+ checkpoint = {"last_incremental": last_incremental}
+ self.log(f"Saving checkpoint: {checkpoint}")
+ pipes.update_checkpoints_in_github(
+ checkpoint=checkpoint,
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/milestone")
+ def fetch_maven_index_repo_spring_io_milestone(self):
+ checkpoint_path = "maven/repo.spring.io/milestone/checkpoints.json"
+ maven_url = "https://repo.spring.io/artifactory/milestone"
+ checkpoint = pipes.get_checkpoint_from_file(
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ )
+ last_incremental = checkpoint.get("last_incremental")
+ self.log(f"last_incremental: {last_incremental}")
+ self.maven_nexus_collector = maven.MavenNexusCollector(
+ maven_url=maven_url,
+ last_incremental=last_incremental,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/milestone")
+ def mine_and_publish_maven_packageurls_repo_spring_io_milestone(self):
+ _mine_and_publish_packageurls(
+ packageurls=self.maven_nexus_collector.get_packages(),
+ total_package_count=None,
+ data_cluster=self.data_cluster,
+ checked_out_repos=self.checked_out_repos,
+ working_path=self.working_path,
+ append_purls=self.append_purls,
+ commit_msg_func=self.commit_message,
+ logger=self.log,
+ )
+
+ @optional_step("repo.spring.io/milestone")
+ def save_check_point_repo_spring_io_milestone(self):
+ checkpoint_path = "maven/repo.spring.io/milestone/checkpoints.json"
+ last_incremental = self.maven_nexus_collector.index_properties.get(
+ "nexus.index.last-incremental"
+ )
+ checkpoint = {"last_incremental": last_incremental}
+ self.log(f"Saving checkpoint: {checkpoint}")
+ pipes.update_checkpoints_in_github(
+ checkpoint=checkpoint,
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ logger=self.log,
+ )
+
+ @optional_step("plugins.gradle.org")
+ def fetch_maven_index_plugins_gradle_org(self):
+ checkpoint_path = "maven/plugins.gradle.org/checkpoints.json"
+ maven_url = "https://plugins.gradle.org/m2"
+ checkpoint = pipes.get_checkpoint_from_file(
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ )
+ last_incremental = checkpoint.get("last_incremental")
+ self.log(f"last_incremental: {last_incremental}")
+ self.maven_nexus_collector = maven.MavenNexusCollector(
+ maven_url=maven_url,
+ last_incremental=last_incremental,
+ logger=self.log,
+ )
+
+ @optional_step("plugins.gradle.org")
+ def mine_and_publish_maven_packageurls_plugins_gradle_org(self):
+ _mine_and_publish_packageurls(
+ packageurls=self.maven_nexus_collector.get_packages(),
+ total_package_count=None,
+ data_cluster=self.data_cluster,
+ checked_out_repos=self.checked_out_repos,
+ working_path=self.working_path,
+ append_purls=self.append_purls,
+ commit_msg_func=self.commit_message,
+ logger=self.log,
+ )
+
+ @optional_step("plugins.gradle.org")
+ def save_check_point_plugins_gradle_org(self):
+ checkpoint_path = "maven/plugins.gradle.org/checkpoints.json"
+ last_incremental = self.maven_nexus_collector.index_properties.get(
+ "nexus.index.last-incremental"
+ )
+ checkpoint = {"last_incremental": last_incremental}
+ self.log(f"Saving checkpoint: {checkpoint}")
+ pipes.update_checkpoints_in_github(
+ checkpoint=checkpoint,
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ logger=self.log,
+ )
+
+ @optional_step("repository.apache.org")
+ def fetch_maven_index_repository_apache_org(self):
+ checkpoint_path = "maven/repository.apache.org/checkpoints.json"
+ maven_url = "https://repository.apache.org/snapshots"
+ checkpoint = pipes.get_checkpoint_from_file(
+ cloned_repo=self.checkpoint_config_repo,
+ path=checkpoint_path,
+ )
last_incremental = checkpoint.get("last_incremental")
self.log(f"last_incremental: {last_incremental}")
self.maven_nexus_collector = maven.MavenNexusCollector(
+ maven_url=maven_url,
last_incremental=last_incremental,
logger=self.log,
)
- def mine_and_publish_maven_packageurls(self):
+ @optional_step("repository.apache.org")
+ def mine_and_publish_maven_packageurls_repository_apache_org(self):
_mine_and_publish_packageurls(
packageurls=self.maven_nexus_collector.get_packages(),
total_package_count=None,
@@ -77,7 +273,9 @@ def mine_and_publish_maven_packageurls(self):
logger=self.log,
)
- def save_check_point(self):
+ @optional_step("repository.apache.org")
+ def save_check_point_repository_apache_org(self):
+ checkpoint_path = "maven/repository.apache.org/checkpoints.json"
last_incremental = self.maven_nexus_collector.index_properties.get(
"nexus.index.last-incremental"
)
@@ -86,6 +284,6 @@ def save_check_point(self):
pipes.update_checkpoints_in_github(
checkpoint=checkpoint,
cloned_repo=self.checkpoint_config_repo,
- path=self.checkpoint_path,
+ path=checkpoint_path,
logger=self.log,
)
diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py
index 076e2ac4..dc18725b 100644
--- a/minecode_pipelines/pipes/__init__.py
+++ b/minecode_pipelines/pipes/__init__.py
@@ -63,9 +63,12 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
def get_checkpoint_from_file(cloned_repo, path):
checkpoint_path = os.path.join(cloned_repo.working_dir, path)
- with open(checkpoint_path) as f:
- checkpoint_data = json.load(f)
- return checkpoint_data or {}
+ try:
+ with open(checkpoint_path) as f:
+ checkpoint_data = json.load(f)
+ return checkpoint_data or {}
+ except FileNotFoundError:
+ return {}
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None):
diff --git a/minecode_pipelines/pipes/maven.py b/minecode_pipelines/pipes/maven.py
index 4b58dfa7..6300d960 100644
--- a/minecode_pipelines/pipes/maven.py
+++ b/minecode_pipelines/pipes/maven.py
@@ -571,6 +571,7 @@ class MavenNexusCollector:
def __init__(
self,
+ maven_url=MAVEN_BASE_URL,
index_location=None,
index_properties_location=None,
last_incremental=None,
@@ -588,7 +589,7 @@ def __init__(
if index_properties_location:
self.index_properties_location = index_properties_location
else:
- index_property_download = self._fetch_index_properties()
+ index_property_download = self._fetch_index_properties(maven_url=maven_url)
self.index_properties_location = index_property_download.path
if self.index_properties_location:
@@ -600,7 +601,7 @@ def __init__(
if last_incremental:
self.index_location = None
index_increment_downloads = self._fetch_index_increments(
- last_incremental=last_incremental
+ last_incremental=last_incremental, maven_url=maven_url
)
self.index_increment_locations = [
download.path for download in index_increment_downloads
@@ -611,7 +612,7 @@ def __init__(
self.index_location = index_location
self.index_increment_locations = []
else:
- index_download = self._fetch_index()
+ index_download = self._fetch_index(maven_url=maven_url)
self.index_location = index_download.path
self.index_increment_locations = []
@@ -627,23 +628,25 @@ def _fetch_http(self, uri):
self.downloads.append(fetched)
return fetched
- def _fetch_index(self, uri=MAVEN_INDEX_URL):
+ def _fetch_index(self, maven_url=MAVEN_BASE_URL):
"""
- Fetch the maven index at `uri` and return a Download with information
+ Fetch the maven index under `maven_url` and return a Download with information
about where it was saved.
"""
+ uri = maven_url.rstrip("/") + "/.index/nexus-maven-repository-index.gz"
index = self._fetch_http(uri)
return index
- def _fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL):
+ def _fetch_index_properties(self, maven_url=MAVEN_BASE_URL):
"""
- Fetch the maven index properties file at `uri` and return a Download
+ Fetch the maven index properties file under `maven_url` and return a Download
with information about where it was saved.
"""
+ uri = maven_url.rstrip("/") + "/.index/nexus-maven-repository-index.properties"
index_properties = self._fetch_http(uri)
return index_properties
- def _fetch_index_increments(self, last_incremental):
+ def _fetch_index_increments(self, last_incremental, maven_url=MAVEN_BASE_URL):
"""
Fetch maven index increments, starting past `last_incremental`, and
return a list of Downloads with information about where they were saved.
@@ -653,7 +656,10 @@ def _fetch_index_increments(self, last_incremental):
if increment_index <= last_incremental:
continue
if key.startswith("nexus.index.incremental"):
- index_increment_url = MAVEN_INDEX_INCREMENT_BASE_URL.format(index=increment_index)
+ index_increment_url = (
+ maven_url.rstrip("/")
+ + f"/.index/nexus-maven-repository-index.{increment_index}.gz"
+ )
index_increment = self._fetch_http(index_increment_url)
index_increment_downloads.append(index_increment)
return index_increment_downloads