Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@
from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2
from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2
from vulnerabilities.pipelines.v2_importers import postgresql_importer as postgresql_importer_v2
from vulnerabilities.pipelines.v2_importers import (
project_kb_msr2019_importer as project_kb_msr2019_importer_v2,
)
from vulnerabilities.pipelines.v2_importers import (
project_kb_statements_importer as project_kb_statements_importer_v2,
)
from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2
from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2
from vulnerabilities.pipelines.v2_importers import redhat_importer as redhat_importer_v2
Expand Down Expand Up @@ -83,6 +89,8 @@
github_osv_importer_v2.GithubOSVImporterPipeline,
redhat_importer_v2.RedHatImporterPipeline,
aosp_importer_v2.AospImporterPipeline,
project_kb_statements_importer_v2.ProjectKBStatementsPipeline,
project_kb_msr2019_importer_v2.ProjectKBMSR2019Pipeline,
nvd_importer.NVDImporterPipeline,
github_importer.GitHubAPIImporterPipeline,
gitlab_importer.GitLabImporterPipeline,
Expand Down
23 changes: 5 additions & 18 deletions vulnerabilities/pipelines/v2_importers/aosp_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,9 @@
from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import PackageCommitPatchData
from vulnerabilities.importer import PatchData
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.importer import VulnerabilitySeverity
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.pipes.advisory import classify_patch_source
from vulnerabilities.pipes.advisory import append_patch_classifications
from vulnerabilities.severity_systems import GENERIC


Expand Down Expand Up @@ -90,23 +86,14 @@ def collect_advisories(self):
patch_url = commit_data.get("patchUrl")
commit_id = commit_data.get("commitId")

base_purl, patch_objs = classify_patch_source(
append_patch_classifications(
url=patch_url,
commit_hash=commit_id,
patch_text=None,
affected_packages=affected_packages,
references=references,
patches=patches,
)
for patch_obj in patch_objs:
if isinstance(patch_obj, PackageCommitPatchData):
fixed_commit = patch_obj
affected_package = AffectedPackageV2(
package=base_purl,
fixed_by_commit_patches=[fixed_commit],
)
affected_packages.append(affected_package)
elif isinstance(patch_obj, PatchData):
patches.append(patch_obj)
elif isinstance(patch_obj, ReferenceV2):
references.append(patch_obj)

url = (
"https://raw.githubusercontent.com/quarkslab/aosp_dataset/refs/heads/master/cves/"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import csv
from pathlib import Path
from typing import Iterable

from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.pipes.advisory import append_patch_classifications


class ProjectKBMSR2019Pipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB MSR2019 Importer Pipeline.

    Collect advisories from the ProjectKB MSR2019 CSV dataset:
    https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv

    Each data row maps a vulnerability id to a fix commit in a VCS repository.
    """

    pipeline_id = "project-kb-MSR-2019_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    repo_url = "git+https://github.com/SAP/project-kb"

    # Path of the dataset CSV, relative to the cloned repository root.
    dataset_path = "MSR2019/dataset/vulas_db_msr2019_release.csv"

    @classmethod
    def steps(cls):
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

    def clone_repo(self):
        """Clone the ProjectKB repository to a temporary location."""
        self.log("Cloning ProjectKB advisory data...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of data rows (header excluded) in the dataset CSV."""
        csv_path = Path(self.vcs_response.dest_dir) / self.dataset_path

        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            count = sum(1 for _ in reader)

        self.log(f"Estimated advisories to process: {count}")
        return count

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """Yield one AdvisoryData per well-formed CSV row.

        Rows are expected as: vuln_id, vcs_url, commit_hash, poc.
        Rows that are malformed or missing a required field are skipped.
        """
        self.log("Collecting fix commits from ProjectKB ( vulas_db_msr2019_release )...")
        csv_path = Path(self.vcs_response.dest_dir) / self.dataset_path

        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header

            for row in reader:
                # Only process rows with exactly the four expected columns.
                if len(row) != 4:
                    continue

                vuln_id, vcs_url, commit_hash, poc = row

                if not vuln_id or not vcs_url or not commit_hash:
                    continue

                patches = []
                affected_packages = []
                references = []
                # Classify the fix-commit URL into package/patch/reference data.
                append_patch_classifications(
                    url=vcs_url,
                    commit_hash=commit_hash,
                    patch_text=None,
                    affected_packages=affected_packages,
                    references=references,
                    patches=patches,
                )

                yield AdvisoryData(
                    advisory_id=vuln_id,
                    affected_packages=affected_packages,
                    patches=patches,
                    references_v2=references,
                    url="https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv",
                )

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        self.log("Removing cloned repository...")
        # clone_repo may never have run (e.g. on_failure after an early
        # failure), so the attribute can be unset — guard with getattr.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from collections import defaultdict
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs
from packageurl import PackageURL
from univers.version_range import RANGE_CLASS_BY_SCHEMES
from univers.versions import InvalidVersion

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.pipes.advisory import append_patch_classifications
from vulnerabilities.utils import get_advisory_url
from vulnerabilities.utils import is_commit


class ProjectKBStatementsPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB Statements Importer Pipeline.

    Collect advisories from ProjectKB YAML statements:
    https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/statement.yaml
    """

    pipeline_id = "project-kb-statements_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"

    @classmethod
    def steps(cls):
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

    def clone_repo(self):
        """Clone the ProjectKB vulnerability-data branch to a temporary location."""
        self.log("Cloning ProjectKB Statements advisory data...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of statement.yaml files under /statements."""
        base_path = Path(self.vcs_response.dest_dir) / "statements"
        # Count only statement.yaml files: collect_advisories() processes
        # nothing else, so counting every *.yaml would overestimate.
        count = sum(1 for _ in base_path.rglob("statement.yaml"))
        self.log(f"Estimated advisories to process: {count}")
        return count

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """Yield one AdvisoryData per statement.yaml file under /statements."""
        self.log("Collecting fix commits from YAML statements under /statements....")
        base_path = Path(self.vcs_response.dest_dir) / "statements"

        for yaml_file in base_path.rglob("statement.yaml"):
            with open(yaml_file, encoding="utf-8") as f:
                yaml_data = saneyaml.load(f)

            vulnerability_id = yaml_data.get("vulnerability_id")
            if not vulnerability_id:
                continue

            description, references = self._collect_notes(yaml_data)

            affected_packages = []
            patches = []
            self._collect_fixes(yaml_data, affected_packages, references, patches)
            affected_packages.extend(self._collect_affected_packages(yaml_data))

            advisory_url = get_advisory_url(
                file=yaml_file,
                base_path=base_path,
                url="https://github.com/SAP/project-kb/blob/vulnerability-data/statements/",
            )

            yield AdvisoryData(
                advisory_id=vulnerability_id,
                summary=description,
                affected_packages=affected_packages,
                references_v2=references,
                patches=patches,
                url=advisory_url,
            )

    def _collect_notes(self, yaml_data):
        """Return (description, references) built from the ``notes`` entries.

        Notes carry free text (joined into the advisory summary) and may also
        carry lists of links which are stored as references. Links are
        collected even when a note has no text, since some statements contain
        link-only notes (e.g. CVE-2018-16406).
        """
        note_texts = []
        references = []
        for note_entry in yaml_data.get("notes", []):
            for link in note_entry.get("links", []):
                references.append(ReferenceV2(url=link))

            text_content = note_entry.get("text")
            if text_content:
                note_texts.append(text_content)

        return "\n".join(note_texts), references

    def _collect_fixes(self, yaml_data, affected_packages, references, patches):
        """Classify each fix commit and append results to the accumulators."""
        for fix in yaml_data.get("fixes", []):
            for commit in fix.get("commits", []):
                commit_hash = commit.get("id")
                # Drop values that are not plausible commit hashes.
                if not is_commit(commit_hash):
                    commit_hash = None

                append_patch_classifications(
                    url=commit.get("repository"),
                    commit_hash=commit_hash,
                    patch_text=None,
                    affected_packages=affected_packages,
                    references=references,
                    patches=patches,
                )

    def _collect_affected_packages(self, yaml_data):
        """Return AffectedPackageV2 entries built from the ``artifacts`` purls.

        Artifact purls are grouped by versionless base purl; their versions
        are split into affected/fixed buckets using the ``affected`` flag and
        turned into version ranges.
        """
        # base purl -> (affected versions, fixed versions)
        purls_to_versions = defaultdict(lambda: ([], []))
        for artifact in yaml_data.get("artifacts", []):
            purl_str = artifact.get("id")

            try:
                purl = PackageURL.from_string(purl_str)
            except ValueError:
                self.log(f"Invalid PackageURL: {purl_str!r}")
                continue

            # Skip package types univers has no version-range support for.
            if purl.type not in RANGE_CLASS_BY_SCHEMES:
                continue

            base_purl = PackageURL(
                type=purl.type,
                namespace=purl.namespace,
                name=purl.name,
            )

            affected_versions, fixed_versions = purls_to_versions[base_purl]
            if artifact.get("affected"):
                affected_versions.append(purl.version)
            else:
                fixed_versions.append(purl.version)

        affected_packages = []
        for base_purl, (affected_versions, fixed_versions) in purls_to_versions.items():
            version_range_class = RANGE_CLASS_BY_SCHEMES.get(base_purl.type)

            affected_range = None
            fixed_range = None

            if affected_versions:
                try:
                    affected_range = version_range_class.from_versions(affected_versions)
                except InvalidVersion as e:
                    self.log(f"Invalid affected versions for {base_purl}: {e}")

            if fixed_versions:
                try:
                    fixed_range = version_range_class.from_versions(fixed_versions)
                except InvalidVersion as e:
                    self.log(f"Invalid fixed versions for {base_purl}: {e}")

            if affected_range or fixed_range:
                affected_packages.append(
                    AffectedPackageV2(
                        package=base_purl,
                        affected_version_range=affected_range,
                        fixed_version_range=fixed_range,
                    )
                )

        return affected_packages

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        self.log("Removing cloned repository...")
        # clone_repo may never have run (e.g. on_failure after an early
        # failure), so the attribute can be unset — guard with getattr.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
25 changes: 25 additions & 0 deletions vulnerabilities/pipes/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from aboutcode.hashid import get_core_purl
from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import PackageCommitPatchData
from vulnerabilities.importer import PatchData
from vulnerabilities.importer import ReferenceV2
Expand Down Expand Up @@ -500,3 +501,27 @@ def advisories_checksum(advisories: Union[Advisory, List[Advisory]]) -> str:

checksum = hashlib.sha1(combined_contents.encode())
return checksum.hexdigest()


def append_patch_classifications(
    url, commit_hash, patch_text, affected_packages, patches, references
):
    """Classify a patch source and distribute the classified objects.

    Runs classify_patch_source on the given url/commit_hash/patch_text and
    appends each resulting object to the matching accumulator list:
    affected_packages, patches, or references. All provided commits are
    treated as fixed commits.
    """
    base_purl, classified_objs = classify_patch_source(
        url=url, commit_hash=commit_hash, patch_text=patch_text
    )

    for obj in classified_objs:
        # Check PackageCommitPatchData before PatchData: ordering matters
        # if one is a subclass of the other.
        if isinstance(obj, PackageCommitPatchData):
            # A commit patch pins a fixing commit on its base package.
            affected_packages.append(
                AffectedPackageV2(package=base_purl, fixed_by_commit_patches=[obj])
            )
        elif isinstance(obj, PatchData):
            patches.append(obj)
        elif isinstance(obj, ReferenceV2):
            references.append(obj)
Loading
Loading