diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index 8f0fb4d3..bf6bd360 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -476,9 +476,6 @@ def process_request(purl_str, **kwargs): collect_links = re.compile(r'href="([^"]+)"').findall -collect_links_and_artifact_timestamps = re.compile( - r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' -).findall def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): @@ -675,6 +672,62 @@ def filter_for_artifacts(timestamps_by_links): return timestamps_by_links_filtered +def collect_links_and_artifact_timestamps(text): + # Return a list of sets containing all link locations and their + # corresponding timestamps extracted from a given HTML text. + + # Pattern that matches with https://repo.maven.apache.org/maven2 + maven_apache_pattern = re.compile( + r']*>[^<]*\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' + ) + maven_apache_matches = maven_apache_pattern.findall(text) + if maven_apache_matches: + return maven_apache_matches + + # Pattern that matces with + # both Apache (UTC) and Nexus (Z) formats + # https://repository.jboss.org/nexus/service/rest/repository/browse/releases/ + # https://repository.jboss.org/nexus/service/rest/repository/browse/public/ + # https://repository.apache.org/snapshots/ + repo_jboss_apache_pattern = re.compile( + r']*>[^<]*\s*\s*((?:[A-Z][a-z]{2}\s+[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+(?:UTC|Z)\s+\d{4})| )\s*' + ) + repo_jboss_apache_matches = repo_jboss_apache_pattern.findall(text) + # Convert   to empty string for table format + if repo_jboss_apache_matches: + return [ + (item, "" if timestamp == " " else timestamp) + for item, timestamp in repo_jboss_apache_matches + ] + + # Pattern that matches with + # https://repo.spring.io/milestone + repo_spring_pattern = re.compile( + r']*>[^<]*\s+(\d{2}-[A-Z][a-z]{2}-\d{4}\s+\d{2}:\d{2})' + ) + repo_spring_matches = repo_spring_pattern.findall(text) + if repo_spring_matches: + return 
repo_spring_matches + + # Simple links in <pre> tags without timestamps (Gradle plugins format)
+    # https://plugins.gradle.org/m2/
+    plugins_gradle_pattern = re.compile(r'
]*>[^<]*
') + plugins_gradle_matches = plugins_gradle_pattern.findall(text) + if plugins_gradle_matches: + # Filter out parent directory link if present + filtered_matches = [] + for href in plugins_gradle_matches: + # Skip parent directory links + if href != "../" and not href.startswith(".."): + filtered_matches.append((href, "")) + + # Only return if we found non-parent links + if filtered_matches: + return filtered_matches + + return [] + + def collect_links_from_text(text, filter): """ Return a mapping of link locations and their timestamps, given HTML `text` @@ -700,7 +753,7 @@ def create_absolute_urls_for_links(text, url, filter): url = url.rstrip("/") timestamps_by_links = collect_links_from_text(text, filter) for link, timestamp in timestamps_by_links.items(): - if not link.startswith(url): + if not link.startswith("http:") and not link.startswith("https:"): link = f"{url}/{link}" timestamps_by_absolute_links[link] = timestamp return timestamps_by_absolute_links @@ -758,23 +811,20 @@ def get_artifact_sha1(artifact_url): return sha1 -def get_classifier_from_artifact_url( - artifact_url, package_version_page_url, package_name, package_version -): +def get_classifier_from_artifact_url(artifact_url, package_name, package_version): """ Return the classifier from a Maven artifact URL `artifact_url`, otherwise return None if a classifier cannot be determined from `artifact_url` """ classifier = None - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip("/") - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" + package_name_version_portion = f"{package_name}-{package_version}" + artifact_url_filename = artifact_url.rsplit("/", 1)[-1] + remaining_url_portion = artifact_url_filename.replace(package_name_version_portion, "") # artifact_url = 
'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - # ['', '-onejar.jar'] - _, remaining_url_portion = artifact_url.split(leading_url_portion) - # ['-onejar', 'jar'] + # artifact_url_filename = 'livereload-jvm-0.2.0-onejar.jar' + # remaining_url_portion = '-onejar.jar' remaining_url_portions = remaining_url_portion.split(".") + # ['-onejar', 'jar'] if remaining_url_portions and remaining_url_portions[0]: # '-onejar' classifier = remaining_url_portions[0] diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index ed754e31..f50895fa 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -132,12 +132,14 @@ def process_request(importable_uri): timestamps_by_artifact_links = get_artifact_links(version_page_url) for artifact_link, timestamp in timestamps_by_artifact_links.items(): sha1 = get_artifact_sha1(artifact_link) - classifier = get_classifier_from_artifact_url( - artifact_link, version_page_url, name, version - ) + classifier = get_classifier_from_artifact_url(artifact_link, name, version) qualifiers = None if classifier: qualifiers = f"classifier={classifier}" + # Some repository listings carry no timestamp (empty string); only parse when one is present. + if timestamp: + release_date = dateutil_parse(timestamp) + else: + release_date = None - release_date = dateutil_parse(timestamp) package_data = PackageData( type="maven", diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index df6da9cf..baf1fbc7 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -26,5 +26,15 @@ class Command(VerboseCommand): help = "Run a Package request queue." 
def handle(self, *args, **options): - maven_root_url = "https://repo.maven.apache.org/maven2" - crawl_maven_repo_from_root(root_url=maven_root_url) + # Add the maven root URLs + # Ref: https://github.com/aboutcode-org/purldb/issues/630#issuecomment-3599942716 + maven_root_urls = [ + "https://repo.maven.apache.org/maven2", + "https://repo.spring.io/artifactory/milestone", + "https://plugins.gradle.org/m2", + "https://repository.apache.org/content/groups/snapshots", + "https://repository.jboss.org/nexus/service/rest/repository/browse/releases", + "https://repository.jboss.org/nexus/service/rest/repository/browse/public", + ] + for maven_root_url in maven_root_urls: + crawl_maven_repo_from_root(root_url=maven_root_url) diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py index 541bf28b..05d31f6c 100644 --- a/minecode/tests/collectors/test_maven.py +++ b/minecode/tests/collectors/test_maven.py @@ -201,7 +201,7 @@ def test_get_merged_ancestor_package_from_maven_package( class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + test_data_dir = os.path.join(os.path.dirname(__file__), "../testfiles") def test_check_if_file_name_is_linked_on_page(self): links = ["foo/", "bar/", "baz/"] @@ -500,12 +500,80 @@ def test_get_artifact_sha1(self, mock_request_get): def test_get_classifier_from_artifact_url(self): artifact_url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar" - package_version_page_url = ( - "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/" - ) package_name = "livereload-jvm" package_version = "0.2.0" classifier = maven.get_classifier_from_artifact_url( - artifact_url, package_version_page_url, package_name, package_version + artifact_url, package_name, package_version ) self.assertEqual("onejar", classifier) + + def 
test_collect_links_and_artifact_timestamps_repo_maven_apache_org(self): + # https://repo.maven.apache.org/maven2 + with open(self.get_test_loc("maven/html/maven.apache.org/abbot.html")) as file: + text = file.read() + expected = [ + ("1.4.0/", "2015-09-22 16:03"), + ("maven-metadata.xml", "2015-09-24 14:18"), + ] + + self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text)) + + def test_collect_links_and_artifact_timestamps_repository_jboss_org(self): + # https://repository.jboss.org/nexus/service/rest/repository/browse/public/ + # https://repository.jboss.org/nexus/service/rest/repository/browse/releases/ + with open(self.get_test_loc("maven/html/repository.jboss.org/commons-codec.html")) as file: + text = file.read() + expected = [ + ("1.2/", ""), + ( + "https://repository.jboss.org/nexus/repository/public/apache-codec/commons-codec/maven-metadata.xml", + "Fri Sep 05 09:38:07 Z 2025", + ), + ] + + self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text)) + + def test_collect_links_and_artifact_timestamps_repository_apache_org(self): + # https://repository.apache.org/snapshots/ + with open(self.get_test_loc("maven/html/repository.apache.org/common-chain.html")) as file: + text = file.read() + expected = [ + ( + "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/1.3-SNAPSHOT/", + "Thu Jul 04 05:45:00 UTC 2013", + ), + ( + "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/2.0-SNAPSHOT/", + "Tue Aug 21 20:26:48 UTC 2018", + ), + ( + "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.md5", + "Tue Aug 21 20:26:47 UTC 2018", + ), + ( + "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.sha1", + "Tue Aug 21 20:26:47 UTC 2018", + ), + ] + + self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text)) + + def 
test_collect_links_and_artifact_timestamps_repo_spring_io(self): + # https://repo.spring.io/release + with open(self.get_test_loc("maven/html/repo.spring.io/scstest.html")) as file: + text = file.read() + expected = [ + ("0.0.11.M2/", "07-Aug-2019 08:40"), + ("0.0.11.RC2/", "07-Aug-2019 08:36"), + ("maven-metadata.xml", "07-Aug-2019 09:07"), + ] + + self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text)) + + def test_collect_links_and_artifact_timestamps_plugin_gradle_org(self): + # https://plugins.gradle.org/m2/ + with open(self.get_test_loc("maven/html/plugins.gradle.org/test.html")) as file: + text = file.read() + expected = [("0.0.10/", ""), ("1.0.1/", ""), ("1.1.0/", ""), ("maven-metadata.xml", "")] + + self.assertEqual(expected, maven.collect_links_and_artifact_timestamps(text)) diff --git a/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html b/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html new file mode 100644 index 00000000..7b2ce8fd --- /dev/null +++ b/minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html @@ -0,0 +1,29 @@ + + + + + Central Repository: abbot/abbot + + + + + +
+

abbot/abbot

+
+
+
+
+../
+1.4.0/                                            2015-09-22 16:03         -
+maven-metadata.xml                                2015-09-24 14:18       402
+		
+
+
+ + + diff --git a/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html b/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html new file mode 100644 index 00000000..b4df3484 --- /dev/null +++ b/minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html @@ -0,0 +1,10 @@ + + + + +
0.0.10/
+
1.0.1/
+
1.1.0/
+
maven-metadata.xml
+ + diff --git a/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html b/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html new file mode 100644 index 00000000..b0c8c1f4 --- /dev/null +++ b/minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html @@ -0,0 +1,14 @@ + + + +Index of milestone/com/albertoimpl/test/scstest/releasetest + + +

Index of milestone/com/albertoimpl/test/scstest/releasetest

+
Name                Last modified      Size

+
../
+0.0.11.M2/           07-Aug-2019 08:40    -
+0.0.11.RC2/          07-Aug-2019 08:36    -
+maven-metadata.xml   07-Aug-2019 09:07  449 bytes
+
+
Artifactory Online Server
diff --git a/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html b/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html new file mode 100644 index 00000000..29937019 --- /dev/null +++ b/minecode/tests/testfiles/maven/html/repository.apache.org/common-chain.html @@ -0,0 +1,59 @@ + + + Index of /groups/snapshots/commons-chain/commons-chain + + + + + + + + +

Index of /groups/snapshots/commons-chain/commons-chain

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameLast ModifiedSizeDescription
Parent Directory
1.3-SNAPSHOT/Thu Jul 04 05:45:00 UTC 2013 +   +
2.0-SNAPSHOT/Tue Aug 21 20:26:48 UTC 2018 +   +
maven-metadata.xml.md5Tue Aug 21 20:26:47 UTC 2018 + 33 +
maven-metadata.xml.sha1Tue Aug 21 20:26:47 UTC 2018 + 41 +
+ + diff --git a/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html b/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html new file mode 100644 index 00000000..483c331c --- /dev/null +++ b/minecode/tests/testfiles/maven/html/repository.jboss.org/commons-codec.html @@ -0,0 +1,50 @@ + + + + Index of /apache-codec/commons-codec + + + + + + + + + + +

Index of /apache-codec/commons-codec

+ + + + + + + + + + + + + + + + + + + + + + + + +
NameLast ModifiedSizeDescription
Parent Directory
1.2 +   + +   +
maven-metadata.xml + Fri Sep 05 09:38:07 Z 2025 + + 347 +
+ + diff --git a/minecode_pipelines/pipelines/mine_maven.py b/minecode_pipelines/pipelines/mine_maven.py index e11c7b6c..20ddcdf7 100644 --- a/minecode_pipelines/pipelines/mine_maven.py +++ b/minecode_pipelines/pipelines/mine_maven.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +from aboutcode.pipeline import optional_step from scanpipe.pipes import federatedcode from minecode_pipelines import pipes @@ -32,7 +33,7 @@ class MineMaven(MineCodeBasePipeline): """Mine PackageURLs from maven index and publish them to FederatedCode.""" pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/" - checkpoint_path = "maven/checkpoints.json" + append_purls = True @classmethod @@ -41,31 +42,226 @@ def steps(cls): cls.check_federatedcode_eligibility, cls.create_federatedcode_working_dir, cls.fetch_federation_config, - cls.fetch_checkpoint_and_maven_index, - cls.mine_and_publish_maven_packageurls, - cls.save_check_point, + cls.fetch_checkpoint_config_repo, + cls.fetch_maven_index_repo1_maven_org, + cls.mine_and_publish_maven_packageurls_repo1_maven_org, + cls.save_check_point_repo1_maven_org, + cls.fetch_maven_index_repo_spring_io_release, + cls.mine_and_publish_maven_packageurls_repo_spring_io_release, + cls.save_check_point_repo_spring_io_release, + cls.fetch_maven_index_repo_spring_io_milestone, + cls.mine_and_publish_maven_packageurls_repo_spring_io_milestone, + cls.save_check_point_repo_spring_io_milestone, + cls.fetch_maven_index_plugins_gradle_org, + cls.mine_and_publish_maven_packageurls_plugins_gradle_org, + cls.save_check_point_plugins_gradle_org, + cls.fetch_maven_index_repository_apache_org, + cls.mine_and_publish_maven_packageurls_repository_apache_org, + cls.save_check_point_repository_apache_org, cls.delete_working_dir, ) - def fetch_checkpoint_and_maven_index(self): + def fetch_checkpoint_config_repo(self): 
self.checkpoint_config_repo = federatedcode.clone_repository( repo_url=self.pipeline_config_repo, clone_path=self.working_path / "minecode-pipelines-config", logger=self.log, ) + + @optional_step("repo1.maven.org") + def fetch_maven_index_repo1_maven_org(self): + checkpoint_path = "maven/repo.maven.org/checkpoints.json" + maven_url = "https://repo1.maven.org/maven2" checkpoint = pipes.get_checkpoint_from_file( cloned_repo=self.checkpoint_config_repo, - path=self.checkpoint_path, + path=checkpoint_path, + ) + last_incremental = checkpoint.get("last_incremental") + self.log(f"last_incremental: {last_incremental}") + self.maven_nexus_collector = maven.MavenNexusCollector( + maven_url=maven_url, + last_incremental=last_incremental, + logger=self.log, ) + @optional_step("repo1.maven.org") + def mine_and_publish_maven_packageurls_repo1_maven_org(self): + _mine_and_publish_packageurls( + packageurls=self.maven_nexus_collector.get_packages(), + total_package_count=None, + data_cluster=self.data_cluster, + checked_out_repos=self.checked_out_repos, + working_path=self.working_path, + append_purls=self.append_purls, + commit_msg_func=self.commit_message, + logger=self.log, + ) + + @optional_step("repo1.maven.org") + def save_check_point_repo1_maven_org(self): + checkpoint_path = "maven/repo.maven.org/checkpoints.json" + last_incremental = self.maven_nexus_collector.index_properties.get( + "nexus.index.last-incremental" + ) + checkpoint = {"last_incremental": last_incremental} + self.log(f"Saving checkpoint: {checkpoint}") + pipes.update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + logger=self.log, + ) + + @optional_step("repo.spring.io/release") + def fetch_maven_index_repo_spring_io_release(self): + checkpoint_path = "maven/repo.spring.io/release/checkpoints.json" + maven_url = "https://repo.spring.io/artifactory/release" + checkpoint = pipes.get_checkpoint_from_file( + 
cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + ) + last_incremental = checkpoint.get("last_incremental") + self.log(f"last_incremental: {last_incremental}") + self.maven_nexus_collector = maven.MavenNexusCollector( + maven_url=maven_url, + last_incremental=last_incremental, + logger=self.log, + ) + + @optional_step("repo.spring.io/release") + def mine_and_publish_maven_packageurls_repo_spring_io_release(self): + _mine_and_publish_packageurls( + packageurls=self.maven_nexus_collector.get_packages(), + total_package_count=None, + data_cluster=self.data_cluster, + checked_out_repos=self.checked_out_repos, + working_path=self.working_path, + append_purls=self.append_purls, + commit_msg_func=self.commit_message, + logger=self.log, + ) + + @optional_step("repo.spring.io/release") + def save_check_point_repo_spring_io_release(self): + checkpoint_path = "maven/repo.spring.io/release/checkpoints.json" + last_incremental = self.maven_nexus_collector.index_properties.get( + "nexus.index.last-incremental" + ) + checkpoint = {"last_incremental": last_incremental} + self.log(f"Saving checkpoint: {checkpoint}") + pipes.update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + logger=self.log, + ) + + @optional_step("repo.spring.io/milestone") + def fetch_maven_index_repo_spring_io_milestone(self): + checkpoint_path = "maven/repo.spring.io/milestone/checkpoints.json" + maven_url = "https://repo.spring.io/artifactory/milestone" + checkpoint = pipes.get_checkpoint_from_file( + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + ) + last_incremental = checkpoint.get("last_incremental") + self.log(f"last_incremental: {last_incremental}") + self.maven_nexus_collector = maven.MavenNexusCollector( + maven_url=maven_url, + last_incremental=last_incremental, + logger=self.log, + ) + + @optional_step("repo.spring.io/milestone") + def 
mine_and_publish_maven_packageurls_repo_spring_io_milestone(self): + _mine_and_publish_packageurls( + packageurls=self.maven_nexus_collector.get_packages(), + total_package_count=None, + data_cluster=self.data_cluster, + checked_out_repos=self.checked_out_repos, + working_path=self.working_path, + append_purls=self.append_purls, + commit_msg_func=self.commit_message, + logger=self.log, + ) + + @optional_step("repo.spring.io/milestone") + def save_check_point_repo_spring_io_milestone(self): + checkpoint_path = "maven/repo.spring.io/milestone/checkpoints.json" + last_incremental = self.maven_nexus_collector.index_properties.get( + "nexus.index.last-incremental" + ) + checkpoint = {"last_incremental": last_incremental} + self.log(f"Saving checkpoint: {checkpoint}") + pipes.update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + logger=self.log, + ) + + @optional_step("plugins.gradle.org") + def fetch_maven_index_plugins_gradle_org(self): + checkpoint_path = "maven/plugins.gradle.org/checkpoints.json" + maven_url = "https://plugins.gradle.org/m2" + checkpoint = pipes.get_checkpoint_from_file( + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + ) + last_incremental = checkpoint.get("last_incremental") + self.log(f"last_incremental: {last_incremental}") + self.maven_nexus_collector = maven.MavenNexusCollector( + maven_url=maven_url, + last_incremental=last_incremental, + logger=self.log, + ) + + @optional_step("plugins.gradle.org") + def mine_and_publish_maven_packageurls_plugins_gradle_org(self): + _mine_and_publish_packageurls( + packageurls=self.maven_nexus_collector.get_packages(), + total_package_count=None, + data_cluster=self.data_cluster, + checked_out_repos=self.checked_out_repos, + working_path=self.working_path, + append_purls=self.append_purls, + commit_msg_func=self.commit_message, + logger=self.log, + ) + + @optional_step("plugins.gradle.org") + def 
save_check_point_plugins_gradle_org(self): + checkpoint_path = "maven/plugins.gradle.org/checkpoints.json" + last_incremental = self.maven_nexus_collector.index_properties.get( + "nexus.index.last-incremental" + ) + checkpoint = {"last_incremental": last_incremental} + self.log(f"Saving checkpoint: {checkpoint}") + pipes.update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + logger=self.log, + ) + + @optional_step("repository.apache.org") + def fetch_maven_index_repository_apache_org(self): + checkpoint_path = "maven/repository.apache.org/checkpoints.json" + maven_url = "https://repository.apache.org/snapshots" + checkpoint = pipes.get_checkpoint_from_file( + cloned_repo=self.checkpoint_config_repo, + path=checkpoint_path, + ) last_incremental = checkpoint.get("last_incremental") self.log(f"last_incremental: {last_incremental}") self.maven_nexus_collector = maven.MavenNexusCollector( + maven_url=maven_url, last_incremental=last_incremental, logger=self.log, ) - def mine_and_publish_maven_packageurls(self): + @optional_step("repository.apache.org") + def mine_and_publish_maven_packageurls_repository_apache_org(self): _mine_and_publish_packageurls( packageurls=self.maven_nexus_collector.get_packages(), total_package_count=None, @@ -77,7 +273,9 @@ def mine_and_publish_maven_packageurls(self): logger=self.log, ) - def save_check_point(self): + @optional_step("repository.apache.org") + def save_check_point_repository_apache_org(self): + checkpoint_path = "maven/repository.apache.org/checkpoints.json" last_incremental = self.maven_nexus_collector.index_properties.get( "nexus.index.last-incremental" ) @@ -86,6 +284,6 @@ def save_check_point(self): pipes.update_checkpoints_in_github( checkpoint=checkpoint, cloned_repo=self.checkpoint_config_repo, - path=self.checkpoint_path, + path=checkpoint_path, logger=self.log, ) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py 
index 076e2ac4..dc18725b 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -63,9 +63,12 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path): def get_checkpoint_from_file(cloned_repo, path): checkpoint_path = os.path.join(cloned_repo.working_dir, path) - with open(checkpoint_path) as f: - checkpoint_data = json.load(f) - return checkpoint_data or {} + try: + with open(checkpoint_path) as f: + checkpoint_data = json.load(f) + return checkpoint_data or {} + except FileNotFoundError: + return {} def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None): diff --git a/minecode_pipelines/pipes/maven.py b/minecode_pipelines/pipes/maven.py index 4b58dfa7..6300d960 100644 --- a/minecode_pipelines/pipes/maven.py +++ b/minecode_pipelines/pipes/maven.py @@ -571,6 +571,7 @@ class MavenNexusCollector: def __init__( self, + maven_url=MAVEN_BASE_URL, index_location=None, index_properties_location=None, last_incremental=None, @@ -588,7 +589,7 @@ def __init__( if index_properties_location: self.index_properties_location = index_properties_location else: - index_property_download = self._fetch_index_properties() + index_property_download = self._fetch_index_properties(maven_url=maven_url) self.index_properties_location = index_property_download.path if self.index_properties_location: @@ -600,7 +601,7 @@ def __init__( if last_incremental: self.index_location = None index_increment_downloads = self._fetch_index_increments( - last_incremental=last_incremental + last_incremental=last_incremental, maven_url=maven_url ) self.index_increment_locations = [ download.path for download in index_increment_downloads @@ -611,7 +612,7 @@ def __init__( self.index_location = index_location self.index_increment_locations = [] else: - index_download = self._fetch_index() + index_download = self._fetch_index(maven_url=maven_url) self.index_location = index_download.path self.index_increment_locations = [] @@ -627,23 +628,25 @@ 
def _fetch_http(self, uri): self.downloads.append(fetched) return fetched - def _fetch_index(self, uri=MAVEN_INDEX_URL): + def _fetch_index(self, maven_url=MAVEN_BASE_URL): """ Fetch the maven index at `uri` and return a Download with information about where it was saved. """ + uri = maven_url.rstrip("/") + "/.index/nexus-maven-repository-index.gz" index = self._fetch_http(uri) return index - def _fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL): + def _fetch_index_properties(self, maven_url=MAVEN_BASE_URL): """ Fetch the maven index properties file at `uri` and return a Download with information about where it was saved. """ + uri = maven_url.rstrip("/") + "/.index/nexus-maven-repository-index.properties" index_properties = self._fetch_http(uri) return index_properties - def _fetch_index_increments(self, last_incremental): + def _fetch_index_increments(self, last_incremental, maven_url=MAVEN_BASE_URL): """ Fetch maven index increments, starting past `last_incremental`, and return a list of Downloads with information about where they were saved. @@ -653,7 +656,10 @@ def _fetch_index_increments(self, last_incremental): if increment_index <= last_incremental: continue if key.startswith("nexus.index.incremental"): - index_increment_url = MAVEN_INDEX_INCREMENT_BASE_URL.format(index=increment_index) + index_increment_url = ( + maven_url.rstrip("/") + + f"/.index/nexus-maven-repository-index.{increment_index}.gz" + ) index_increment = self._fetch_http(index_increment_url) index_increment_downloads.append(index_increment) return index_increment_downloads