diff --git a/README.md b/README.md index 9868666..b8c9cf5 100644 --- a/README.md +++ b/README.md @@ -43,16 +43,87 @@ You can create your own knowledgebase with the minr command, available at https: Syntax: scanoss [parameters] [TARGET] -Configuration: -* -w Treats TARGET as a .wfp file regardless of the actual file extension -* -s FILE Use assets specified in the provided JSON SBOM (CycloneDX/SPDX2.2 JSON format) as input to identification -* -b FILE Ignore matches to assets specified in the provided JSON SBOM (CycloneDX/SPDX2.2 JSON format) - -Options: -* -t Tests engine performance -* -v Display version and exit -* -h Display this help and exit -* -d Enable debugging information +## Configuration Options + +### Basic Configuration +* `-w, --wfp` - Process TARGET as a .wfp file, regardless of its actual extension +* `-H, --hpsm` - Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system) +* `-M, --max-snippets NUM` - Search for up to NUM different components in each file (maximum: 9) +* `-N, --max-components NUM` - Set maximum number of components (default: 5) +* `-T, --tolerance NUM` - Set snippet scanning tolerance percentage (default: 0.1) +* `-r, --rank NUM` - Set maximum component rank accepted (default: 11) +* `--max-files NUM` - Set maximum number of files to fetch during matching (default: 12000) +* `--min-snippet-hits NUM` - Set minimum snippet ID hits for a match (default: 3, disables auto-adjust) +* `--min-snippet-lines NUM` - Set minimum matched lines for a range (default: 10, disables auto-adjust) +* `--range-tolerance NUM` - Set max non-matched lines tolerated in a range (default: 5) +* `--ignore-file-ext` - Ignore file extension during snippet matching (default: honor extension) + +### SBOM and Filtering +* `-s, --sbom FILE` - Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification +* `-b, --blacklist FILE` - Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format) +* 
`--force-snippet` - Force snippet scanning (no full file matching) +* `-c, --component HINT` - Add a component HINT to guide scan results + +### Attribution and Licenses +* `-a, --attribution FILE` - Show attribution notices for the provided SBOM.json file +* `-k, --key KEY` - Show contents of the specified KEY file from MZ sources archive +* `-l, --license LICENSE` - Display OSADL metadata for the given SPDX license ID +* `-L, --full-license` - Enable full license report +* `-F, --flags FLAGS` - Set engine scanning flags (see Engine Scanning Flags section below) + +### General Options +* `-t, --test` - Run engine performance tests +* `-v, --version` - Show version information and exit +* `-n, --name NAME` - Set database name (default: oss) +* `-h, --help` - Display help information and exit +* `-d, --debug` - Store debugging information to disk (/tmp) +* `-q, --quiet` - Suppress JSON output (show only debugging info via STDERR) + +## Environment Variables + +* `SCANOSS_MATCHMAP_MAX` - Set the snippet scanning match map size (default: 10000) +* `SCANOSS_FILE_CONTENTS_URL` - Define the API URL endpoint for sources. Source URL won't be reported if not defined + +## Engine Scanning Flags + +Configure the scanning engine using flags with the `-F/--flags` parameter. 
These settings can also be specified in `/etc/scanoss_flags.cfg` + +| Flag | Setting | +|-------|-------------------------------------------------------| +| 1 | Disable snippet matching (default: enabled) | +| 2 | Enable snippet_ids (default: disabled) | +| 4 | Disable dependencies (default: enabled) | +| 8 | Disable licenses (default: enabled) | +| 16 | Disable copyrights (default: enabled) | +| 32 | Disable vulnerabilities (default: enabled) | +| 64 | Disable quality (default: enabled) | +| 128 | Disable cryptography (default: enabled) | +| 256 | Disable best match only (default: enabled) | +| 512 | Hide identified files (default: disabled) | +| 1024 | Enable download_url (default: disabled) | +| 2048 | Enable "use path hint" logic (default: disabled) | +| 4096 | Disable extended server stats (default: enabled) | +| 8192 | Disable health layer (default: enabled) | +| 16384 | Enable high accuracy, slower scan (default: disabled) | + +### Examples: +```bash +# Scan DIRECTORY without license and dependency data +scanoss -F 12 DIRECTORY +scanoss --flags 12 DIRECTORY + +# Scan TARGET including SBOM assets +scanoss --sbom my_sbom.json TARGET + +# Scan with custom snippet matching parameters +scanoss --min-snippet-hits 5 --min-snippet-lines 15 TARGET + +# Scan with custom range tolerance +scanoss --range-tolerance 10 TARGET + +# Ignore file extensions during matching +scanoss --ignore-file-ext TARGET +``` # File matching logic diff --git a/inc/component.h b/inc/component.h index 5c73f46..9748c10 100644 --- a/inc/component.h +++ b/inc/component.h @@ -2,6 +2,7 @@ #define __COMPONENT_H #include "scanoss.h" +#include "limits.h" #define COMPONENT_DEFAULT_RANK 999 //default rank for components without rank information #define COMPONENT_RANK_SELECTION_MAX 8 //max rank to be considered in component selection @@ -18,11 +19,13 @@ extern int component_rank_max; * */ enum { + IDENTIFIED_FILTERED = -1, IDENTIFIED_NONE = 0, IDENTIFIED_PURL, - IDENTIFIED_PURL_VERSION + 
IDENTIFIED_PURL_VERSION, }; + typedef struct component_data_t { char * vendor; /* component vendor */ @@ -62,6 +65,40 @@ typedef struct component_data_t int third_party_rank; /* Saves third party ranking*/ } component_data_t; +typedef struct keywords +{ + int count; + char word[MAX_FIELD_LN]; +} keywords; + + +typedef struct file_recordset +{ + uint8_t url_id[MD5_LEN]; + char path[MAX_FILE_PATH]; + int path_ln; + bool external; +} file_recordset; + +typedef struct len_rank +{ + int id; + int len; +} len_rank; + +typedef struct component_item +{ + char * vendor; + char * component; + char * purl; + char * version; + char * license; +} component_item; + +extern component_item *ignore_components; +extern component_item *declared_components; + + component_data_t * component_init(void); void component_data_free(component_data_t * data); bool fill_component(component_data_t * component, uint8_t *url_key, char *file_path, uint8_t *url_record); diff --git a/inc/limits.h b/inc/limits.h index 69122a4..c4f781d 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -34,6 +34,10 @@ #define MAX_QUERY_RESPONSE (1024 * 1024 * 8) #define SLOW_QUERY_LIMIT_IN_USEC 2000000 #define MAX_JSON_VALUE_LEN 4096 +#define MAX_FILE_PATH 1024 +#define FETCH_MAX_FILES_DEFAULT 12000 +#define MIN_FILE_SIZE 256 // files below this size will be ignored +#define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates /* Snippets */ #define DEFAULT_MATCHMAP_FILES 10000 // Default number of files evaluated in snippet matching @@ -41,18 +45,13 @@ #define MIN_LINES_COVERAGE 0.8 #define SKIP_SNIPPETS_IF_FILE_BIGGER (1024 * 1024 * 4) #define MAX_SNIPPETS_SCANNED 2500 - +#define SNIPPETS_DEFAULT_RANGE_TOLERANCE 5 /** A maximum number of non-matched lines tolerated inside a matching range */ +#define SNIPPETS_DEFAULT_MIN_MATCH_LINES 5 /** Minimum number of lines matched for a match range to be acepted */ +#define SNIPPETS_DEFAULT_MIN_MATCH_HITS 2 /** Minimum number of snippet ID hits to produce a 
snippet match*/ +#define SNIPPETS_DEFAULT_ADJUST_TOLERANCE true /** Adjust tolerance based on file size */ +#define SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION true /** Honor file extension during snippet matching */ +#define DEFAULT_FETCH_MAX_FILES 12000 /** Maximum number of files to fetch during component matching */ /* Variables */ - -/* During snippet scanning, when a wfp (with more than consecutive_threshold wfps) produces a score higher - than consecutive_score by consecutive_hits in a row, the scan will skip consecutive_jump snippets */ -extern int consecutive_score; -extern int consecutive_hits; -extern int consecutive_jump; -extern int consecutive_threshold; - -extern int range_tolerance; // A maximum number of non-matched lines tolerated inside a matching range -extern int min_match_lines; // Minimum number of lines matched for a match range to be acepted -extern int min_match_hits; // Minimum number of snippet ID hits to produce a snippet match +extern int fetch_max_files; // Maximum number of files to fetch during component matching #endif diff --git a/inc/match.h b/inc/match.h index a807919..e1edced 100644 --- a/inc/match.h +++ b/inc/match.h @@ -25,7 +25,6 @@ typedef struct match_data_t uint32_t * crclist; /* pointer to crc list used in for processing */ char * quality_text; /* quality string used in json output format */ char * crytography_text; /* crytography string used in json output format */ - uint16_t from; } match_data_t; match_data_t * match_data_copy(match_data_t * in); diff --git a/inc/parse.h b/inc/parse.h index 27e3984..1a367dc 100644 --- a/inc/parse.h +++ b/inc/parse.h @@ -4,6 +4,7 @@ #include #include #include "scanoss.h" +#include "component.h" void extract_csv(char *out, char *in, int n, long limit); void lowercase(char *word); diff --git a/inc/scan.h b/inc/scan.h index 4407910..1de390a 100644 --- a/inc/scan.h +++ b/inc/scan.h @@ -66,16 +66,22 @@ typedef struct scan_data_t int max_matchmap_size; bool printed_succed; bool 
windows_line_endings; + bool snippet_adjust_tolerance; // Enable adjust snippet tolerance based on file size + int component_ranking_threshold; //-1 = disable ranking. 0 = all accepted + int snippet_min_hits; + int snippet_min_lines; + int snippet_range_tolerance; + bool snippet_honor_file_extension; } scan_data_t; extern bool force_snippet_scan; -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components); +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); void scan_data_free (scan_data_t * scan); void ldb_scan(scan_data_t * scan); match_t ldb_scan_snippets(scan_data_t *scan_ptr); -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components); -int hash_scan(char *path, int scan_max_snippets, int scan_max_components); +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); #endif diff --git a/inc/scanoss.h b/inc/scanoss.h index e32c313..8c4bbc4 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -30,16 +30,10 @@ #include #include "limits.h" -#define MAX_FILE_PATH 1024 -#define FETCH_MAX_FILES 12000 -#define MIN_FILE_SIZE 256 // files below this size will be ignored -#define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates -#define SNIPPET_LINE_TOLERANCE 10 - #define WFP_LN 4 #define WFP_REC_LN 18 -#define SCANOSS_VERSION "5.4.19" +#define SCANOSS_VERSION "5.4.20" /* Log files */ #define SCAN_LOG 
"/tmp/scanoss_scan.log" @@ -84,40 +78,8 @@ extern const char *dependency_sources[]; typedef enum {MATCH_NONE, MATCH_FILE, MATCH_SNIPPET, MATCH_BINARY} match_t; -typedef struct keywords -{ - int count; - char word[MAX_FIELD_LN]; -} keywords; - - -typedef struct file_recordset -{ - uint8_t url_id[MD5_LEN]; - char path[MAX_FILE_PATH]; - int path_ln; - bool external; -} file_recordset; - -typedef struct len_rank -{ - int id; - int len; -} len_rank; - -typedef struct component_item -{ - char * vendor; - char * component; - char * purl; - char * version; - char * license; -} component_item; - - extern long microseconds_start; extern int map_rec_len; -extern bool match_extensions; /*component hint hold the last component matched/guessed */ extern char * component_hint; @@ -141,12 +103,8 @@ extern struct ldb_table oss_notices; extern bool first_file; -extern int max_vulnerabilities; extern char *ignored_assets; -extern component_item *ignore_components; -extern component_item *declared_components; - /* Prototype declarations */ diff --git a/src/binary_scan.c b/src/binary_scan.c index a7f7a88..2008c90 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -270,7 +270,7 @@ int binary_scan(char * input) char * file_name = field_n(3,input); int target_len = strchr(file_name,',') - file_name; char * target = strndup(file_name, target_len); - scan_data_t * scan = scan_data_init(target, 1, 1); + scan_data_t * scan = scan_data_init(target, 1, 1, true, 0, 3, 5, SNIPPETS_DEFAULT_RANGE_TOLERANCE, false); free(target); memcpy(scan->md5, bin_md5, MD5_LEN); scan->match_type = MATCH_FILE; diff --git a/src/component.c b/src/component.c index 8906313..8b4bc02 100644 --- a/src/component.c +++ b/src/component.c @@ -253,7 +253,7 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa extract_csv(license, (char *)url_record, 5, sizeof(license)); extract_csv(purl, (char *)url_record, 6, sizeof(purl)); extract_csv(url, (char *)url_record, 7, sizeof(url)); - 
extract_csv(rank, (char *)url_record, 13, sizeof(rank)); //extracts the rank field if available + extract_csv(rank, (char *)url_record, 14, sizeof(rank)); //extracts the rank field if available /* Fill url stats if these are available*/ for (int i = 0; i < 5; i++) { char stat[16] = "\0"; @@ -292,10 +292,10 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa MD5((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); } component->age = -1; - if (*rank && strlen(rank) < 3) + if (*rank) { component->rank = atoi(rank); - //scanlog("Component rank from DB: %d\n", component->rank); + //scanlog("Component rank from DB: %s- %d\n", rank, component->rank); } else component->rank = COMPONENT_DEFAULT_RANK; diff --git a/src/debug.c b/src/debug.c index 270c35b..e65c9c6 100644 --- a/src/debug.c +++ b/src/debug.c @@ -138,7 +138,7 @@ void scan_benchmark() for (int f = 0; f < total_files ; f++) { - scan_data_t * scan = scan_data_init("pseudo_file", 0, 0); + scan_data_t * scan = scan_data_init("pseudo_file", 0, 0, true, 0, 3, 5, SNIPPETS_DEFAULT_RANGE_TOLERANCE, false); scan->preload = true; memcpy(scan->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", MD5_LEN); strcpy(scan->file_size, "1024"); diff --git a/src/file.c b/src/file.c index 204915e..d56f9fe 100644 --- a/src/file.c +++ b/src/file.c @@ -192,8 +192,8 @@ int dir_count(char *path) bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { - /* Leave if FETCH_MAX_FILES is reached */ - if (iteration >= FETCH_MAX_FILES) return true; + /* Leave if fetch_max_files is reached */ + if (iteration >= fetch_max_files) return true; /* Ignore path lengths over the limit */ if (!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; @@ -231,7 +231,7 @@ bool count_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_ int * count = ptr; *count = iteration; - if (iteration >= 
FETCH_MAX_FILES) + if (iteration >= fetch_max_files) { return true; } diff --git a/src/help.c b/src/help.c index bb90b46..075f9fb 100644 --- a/src/help.c +++ b/src/help.c @@ -32,6 +32,8 @@ #include "help.h" #include "scanoss.h" #include "limits.h" +#include "match_list.h" +#include "component.h" /** * @brief Print the help @@ -46,34 +48,41 @@ Results are displayed in JSON format through STDOUT.\n\ Syntax: scanoss [parameters] [TARGET]\n\ \n\ Configuration:\n\ --w Process TARGET as a .wfp file, regardless of its actual extension.\n\ --H Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ --e Match only files with identical extensions as the scanned file (default: off).\n\ --M NUMBER Search for up to NUMBER different components in each file (maximum: 9).\n\ --T NUMBER Set snippet scanning tolerance percentage (default: 0.1).\n\ --s SBOM Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ --b SBOM Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ --B SBOM Same as \"-b\" but with forced snippet scanning.\n\ --a SBOM Show attribution notices for the provided SBOM.json file.\n\ --c HINT Add a component HINT to guide scan results.\n\ --k KEY Show contents of the specified KEY file from MZ sources archive.\n\ --l LICENSE Display OSADL metadata for the given SPDX license ID.\n\ --L Enable license full reort.\n\ +-w, --wfp Process TARGET as a .wfp file, regardless of its actual extension.\n\ +-H, --hpsm Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ +-M, --max-snippets NUM Search for up to NUM different components in each file (maximum: 9).\n\ +-N, --max-components NUM Set maximum number of components (default: %d).\n\ +-T, --tolerance NUM Set snippet scanning tolerance percentage (default: 0.1).\n\ +-r, --rank NUM Set maximum component rank accepted (default: %d).\n\ + --max-files NUM Set maximum number of files to fetch during matching 
(default: 12000).\n\ + --min-snippet-hits NUM Set minimum snippet ID hits for a match (default: 3, disables auto-adjust).\n\ + --min-snippet-lines NUM Set minimum matched lines for a range (default: 10, disables auto-adjust).\n\ + --range-tolerance NUM Set max non-matched lines tolerated in a range (default: 5).\n\ + --ignore-file-ext Ignore file extension during snippet matching (default: honor extension).\n\ +-s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ +-b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ + --force-snippet Force snippet scanning (no full file matching).\n\ +-a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\ +-c, --component HINT Add a component HINT to guide scan results.\n\ +-k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\ +-l, --license LICENSE Display OSADL metadata for the given SPDX license ID.\n\ +-L, --full-license Enable full license report.\n\ +-F, --flags FLAGS Set engine scanning flags (see below).\n\ \n\ Options:\n\ --t Run engine performance tests.\n\ --v Show version information and exit.\n\ --n Set database name (default: oss).\n\ --h Display this help information and exit.\n\ --d Store debugging information to disk (/tmp).\n\ --q Suppress JSON output (show only debugging info via STDERR).\n\ +-t, --test Run engine performance tests.\n\ +-v, --version Show version information and exit.\n\ +-n, --name NAME Set database name (default: oss).\n\ +-h, --help Display this help information and exit.\n\ +-d, --debug Store debugging information to disk (/tmp).\n\ +-q, --quiet Suppress JSON output (show only debugging info via STDERR).\n\ \n\ Environment variables:\n\ SCANOSS_MATCHMAP_MAX: Set the snippet scanning match map size (default: %d).\n\ -SCANOSS_FILE_CONTENTS_URL: Define the API URL endpoint for sources. 
Source url wont be reported if it's not defined.\n\ +SCANOSS_FILE_CONTENTS_URL: Define the API URL endpoint for sources. Source URL won't be reported if not defined.\n\ \n\ Engine scanning flags:\n\ -Configure the scanning engine using flags with the -F parameter.\n\ +Configure the scanning engine using flags with the -F/--flags parameter.\n\ These settings can also be specified in %s\n\ +-------+-------------------------------------------------------+\n\ | Flag | Setting |\n\ @@ -94,7 +103,10 @@ These settings can also be specified in %s\n\ | 8192 | Disable health layer (default: enabled) |\n\ | 16384 | Enable high accuracy, slower scan (default: disabled) |\n\ +-------+-------------------------------------------------------+\n\ -Example: scanoss -F 12 DIRECTORY (scan DIRECTORY without license and dependency data)\n\ +Examples:\n\ + scanoss -F 12 DIRECTORY Scan DIRECTORY without license and dependency data\n\ + scanoss --flags 12 DIRECTORY Same as above using long option\n\ + scanoss --sbom my_sbom.json TARGET Scan TARGET including SBOM assets\n\ \n\ -Copyright (C) 2018-2022 SCANOSS.COM\n", DEFAULT_MATCHMAP_FILES, ENGINE_FLAGS_FILE); +Copyright (C) 2018-2022 SCANOSS.COM\n", SCAN_MAX_COMPONENTS_DEFAULT, COMPONENT_DEFAULT_RANK + 1, DEFAULT_MATCHMAP_FILES, ENGINE_FLAGS_FILE); } diff --git a/src/license.c b/src/license.c index 13a8592..74a2b22 100644 --- a/src/license.c +++ b/src/license.c @@ -43,32 +43,79 @@ #include "file.h" #include "query.h" -/** @brief License sources - 0 = Declared in component - 1 = Declared in file with SPDX-License-Identifier - 2 = Detected in header - 3 = Declared in LICENSE file - 4 = Scancode detection - 5 = Scancode detection at mining time - 6 = osslot */ -const char *license_sources[] = {"component_declared", "file_spdx_tag", "file_header", "license_file", "scancode-file", "scancode", "osselot"}; -bool full_license_report = false; +bool full_license_report = false; +struct license_type +{ + char * text; + int id; +}; struct license_list 
{ - char **licenses; + struct license_type*licenses; int count; }; -bool license_add_to_list(struct license_list * ptr, char * license) +//convert license id to license report name +static char * license_id_to_source_name(int id) +{ + switch (id) + { + case 0: + case 35: + return "component_declared"; + case 1: + return "file_spdx_tag"; + case 2: + return "file_header"; + case 3: + case 31: + return "license_file"; + case 4: + return "scancode_file"; + case 5: + return "scancode"; + case 6: + return "component_declared"; + case 7: + case 9: + return "underlying_component"; + case 71: + case 72: + case 73: + case 74: + return "underlying_license_file"; + case 8: + return "scancode"; + + case 10: + return "osselot"; + default: + return NULL; + } +} + +bool license_add_to_list(struct license_list * ptr, char * license, int license_id) { if (!ptr || !license || strlen(license) < 2) return false; - ptr->licenses = realloc(ptr->licenses, sizeof(char *) * (ptr->count + 1)); - if (!ptr->licenses) + + struct license_type *tmp = realloc(ptr->licenses, sizeof(struct license_type) * (ptr->count + 1)); + if (!tmp) + return false; + ptr->licenses = tmp; + ptr->licenses[ptr->count].text = NULL; + ptr->licenses[ptr->count].id = 0; + + /* Allocate with extra padding for CRC32C hardware reads (8-byte blocks) */ + size_t len = strlen(license); + size_t padded_len = ((len + 8) / 8) * 8; /* Round up to next 8-byte boundary */ + ptr->licenses[ptr->count].text = calloc(1, padded_len); + if (!ptr->licenses[ptr->count].text) return false; - ptr->licenses[ptr->count] = strdup(license); + strcpy(ptr->licenses[ptr->count].text, license); + ptr->licenses[ptr->count].id = license_id; ptr->count++; return true; } @@ -79,13 +126,30 @@ void license_free_list(struct license_list * ptr) return; for (int i = 0; i < ptr->count; i++) { - free(ptr->licenses[i]); + free(ptr->licenses[i].text); } free(ptr->licenses); ptr->licenses = NULL; ptr->count = 0; } +static int license_compare_by_id(const void *a, 
const void *b) +{ + const struct license_type *la = a; + const struct license_type *lb = b; + + /* IDs 5 and 6 should go to the end */ + bool a_is_last = (la->id == 5 || la->id == 6); + bool b_is_last = (lb->id == 5 || lb->id == 6); + + if (a_is_last && !b_is_last) + return 1; + if (!a_is_last && b_is_last) + return -1; + + return la->id - lb->id; +} + /** * @brief Remove invalid characters from a license name * @param license license string @@ -222,6 +286,13 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i if (!*license || strlen(license) < 2) return buffer; + + char * license_source_id = license_id_to_source_name(src); + if (!license_source_id) + return buffer; + //skip scancode licenses starting with "LicenseRef" + if (!strncmp(license_source_id, "scancode", 8) && !strncmp(license, "LicenseRef", 10)) + return buffer; /* Calculate CRC to avoid duplicates */ uint32_t CRC = string_crc32c(license); @@ -241,7 +312,7 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i len += sprintf(buffer + len, "{"); len += sprintf(buffer + len, "\"name\": \"%s\",", license); len += osadl_print_license(buffer + len, license, true); - len += sprintf(buffer + len, "\"source\": \"%s\"", license_sources[src]); + len += sprintf(buffer + len, "\"source\": \"%s\"", license_source_id); if (!strstr(license, "LicenseRef")) len += sprintf(buffer + len, ",\"url\": \"https://spdx.org/licenses/%s.html\"", license); len += sprintf(buffer + len, "}"); @@ -275,7 +346,7 @@ static char *split_in_json_array(uint32_t *crclist, char *buffer, char *license, return r; // Return the updated buffer pointer, not the original } -char * license_to_json(uint32_t *crclist, char *buffer, char *license, int src, bool *first_record) +char * license_to_json(uint32_t *crclist, char *buffer, char *license, int src, bool *first_record) { if (!strchr(license, '/')) return json_from_license(crclist, buffer, license, src, first_record); @@ -338,8 +409,7 @@ 
bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * int src = atoi(source); scanlog("Fetched License %s - source ID %d\n", license, src); - if (src < (sizeof(license_sources) / sizeof(license_sources[0]))) - license_add_to_list(&licenses[src], license); + license_add_to_list(licenses, license, src); free(source); free(license); @@ -369,14 +439,12 @@ void print_licenses(component_data_t *comp) uint32_t records = 0; comp->license_text = NULL; - int license_types = sizeof(license_sources) / sizeof(license_sources[0]); - struct license_list licenses_by_type[license_types]; - memset(licenses_by_type, 0, sizeof(licenses_by_type)); + struct license_list licenses_by_type = {.count = 0, .licenses = NULL}; /* Print URL license */ if (comp->license && strlen(comp->license) > 2) { - license_add_to_list(&licenses_by_type[0], comp->license); + license_add_to_list(&licenses_by_type, comp->license, 0); scanlog("License present in URL table"); } else @@ -404,7 +472,7 @@ void print_licenses(component_data_t *comp) //Look if someone of the prefered liceses ids already has a match for (int i = 0; i < 4; i++) { - if (licenses_by_type[i].count > 0) + if (licenses_by_type.count > 0) { scanlog("Stop searching for licenses\n"); break; @@ -440,26 +508,19 @@ void print_licenses(component_data_t *comp) buffer = result + len; bool first = true; - for (int i = 0; i < license_types; i++) + /* Sort licenses by id (ascending) */ + if (licenses_by_type.count > 1) + qsort(licenses_by_type.licenses, licenses_by_type.count, sizeof(struct license_type), license_compare_by_id); + + for (int i = 0; i < licenses_by_type.count; i++) { - if (licenses_by_type[i].count > 0) - { - if (i > 3 && !first && !full_license_report) - break; - for (int j = 0; j < licenses_by_type[i].count; j++) - { - buffer = license_to_json(crclist, buffer, licenses_by_type[i].licenses[j], i, &first); - } - } + buffer = license_to_json(crclist, buffer, licenses_by_type.licenses[i].text, 
licenses_by_type.licenses[i].id, &first); } len = buffer - result; len += sprintf(result + len, "]"); comp->license_text = result; - /* Free all license lists */ - for (int i = 0; i < license_types; i++) - { - license_free_list(&licenses_by_type[i]); - } + license_free_list(&licenses_by_type); + } diff --git a/src/limits.c b/src/limits.c index 8471794..67b6cfd 100644 --- a/src/limits.c +++ b/src/limits.c @@ -9,8 +9,4 @@ * @see https://github.com/scanoss/engine/blob/master/src/limits.c */ -int range_tolerance = 5; /** A maximum number of non-matched lines tolerated inside a matching range */ -int min_match_lines = 10; /** Minimum number of lines matched for a match range to be acepted */ -int min_match_hits = 4; /** Minimum number of snippet ID hits to produce a snippet match*/ - -const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ \ No newline at end of file +int fetch_max_files = 12000; /** Maximum number of files to fetch during component matching */ diff --git a/src/main.c b/src/main.c index b163889..58af1ac 100644 --- a/src/main.c +++ b/src/main.c @@ -46,6 +46,7 @@ #include #include "hpsm.h" #include +#include struct ldb_table oss_url; struct ldb_table oss_file; @@ -63,6 +64,13 @@ struct ldb_table oss_notices; component_item *ignore_components; component_item *declared_components; +int scan_min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; // Minimum number of lines matched for a match range to be acepted +int scan_min_match_hits = SNIPPETS_DEFAULT_MIN_MATCH_HITS; // Minimum number of snippet ID hits to produce a snippet match +int scan_range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; // Maximum number of non-matched lines tolerated inside a matching range +bool scan_adjust_tolerance = SNIPPETS_DEFAULT_ADJUST_TOLERANCE; /** Adjust tolerance based on file size */ +int scan_ranking_threshold = 0; //enabled, all accepted by default +bool scan_honor_file_extension = SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION; + bool lib_encoder_present 
= false; #define LDB_VER_MIN "4.1.0" @@ -207,10 +215,12 @@ void recurse_directory(char *name) if (extension(path)) if (!strcmp(extension(path), "wfp")) wfp = true; if (wfp) - wfp_scan(path, scan_max_snippets, scan_max_components); + wfp_scan(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); ldb_scan(scan); } @@ -261,8 +271,37 @@ uint64_t read_flags() return 0; } +/* Long options structure for getopt_long */ +static struct option long_options[] = { + {"rank", required_argument, 0, 'r'}, + {"tolerance", required_argument, 0, 'T'}, + {"sbom", required_argument, 0, 's'}, + {"blacklist", required_argument, 0, 'b'}, + {"force-snippet", no_argument, 0, 256}, /* Long option only, no short form; a pure switch (case 256 never reads optarg) */ + {"component", required_argument, 0, 'c'}, + {"key", required_argument, 0, 'k'}, + {"attribution", required_argument, 0, 'a'}, + {"flags", required_argument, 0, 'F'}, + {"license", required_argument, 0, 'l'}, + {"full-license", no_argument, 0, 'L'}, + {"name", required_argument, 0, 'n'}, + {"max-snippets", required_argument, 0, 'M'}, + {"max-components", required_argument, 0, 'N'}, + {"max-files", required_argument, 0, 257}, /* Long option only */ + {"min-snippet-hits", required_argument, 0, 258}, /* Long option only */ + {"min-snippet-lines", required_argument, 0, 259}, /* Long option only */ + {"ignore-file-ext", no_argument, 0, 260}, /* Long option only */ + {"range-tolerance", required_argument, 0, 261}, /* Long option only */ + {"wfp", no_argument, 0, 'w'}, + {"test", no_argument, 0, 't'}, + {"version", no_argument, 0, 
'v'}, + {"help", no_argument, 0, 'h'}, + {"debug", no_argument, 0, 'd'}, + {"quiet", no_argument, 0, 'q'}, + {"hpsm", no_argument, 0, 'H'}, + {0, 0, 0, 0} +}; -int component_rank_max = COMPONENT_DEFAULT_RANK + 1; /*Used defined max component rank accepted*/ /** * @brief //TODO * @param argc //TODO @@ -291,9 +330,11 @@ int main(int argc, char **argv) /* Parse arguments */ int option; + int option_index = 0; bool invalid_argument = false; char * ldb_db_name = NULL; - while ((option = getopt(argc, argv, ":r:T:s:b:B:c:k:a:F:l:n:M:N:wtLvhedqH")) != -1) + + while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhdqH", long_options, &option_index)) != -1) { /* Check valid alpha is entered */ if (optarg) @@ -321,7 +362,8 @@ int main(int argc, char **argv) component_hint = strdup(optarg); break; case 'r': - component_rank_max = atoi(optarg); + scan_ranking_threshold = atoi(optarg); + scanlog("Max component rank set to %d\n", scan_ranking_threshold); break; case 'k': @@ -368,8 +410,7 @@ int main(int argc, char **argv) case 'w': force_wfp = true; break; - case 'B': - ignore_components = get_components(optarg); + case 256: /* --force-snippet (long option only) */ force_snippet_scan = true; break; case 't': @@ -388,10 +429,6 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); break; - case 'e': - match_extensions = true; - break; - case 'q': engine_flags = engine_flags_cmd_line; debug_on = true; @@ -402,7 +439,7 @@ int main(int argc, char **argv) case 'd': engine_flags = engine_flags_cmd_line; debug_on = true; - scanlog(""); // Log time stamp + scanlog_init(); break; case ':': @@ -414,7 +451,35 @@ int main(int argc, char **argv) printf("Unsupported option: %c\n", optopt); invalid_argument = true; break; - + + case 257: /* --max-files */ + fetch_max_files = atoi(optarg); + scanlog("Max files to fetch set to %d\n", fetch_max_files); + break; + + case 258: /* --min-match-hits */ + scan_min_match_hits = atoi(optarg); + scan_adjust_tolerance = false; + 
scanlog("Min match hits set to %d (auto-adjust disabled)\n", scan_min_match_hits); + break; + + case 259: /* --min-match-lines */ + scan_min_match_lines = atoi(optarg); + scan_adjust_tolerance = false; + scanlog("Min match lines set to %d (auto-adjust disabled)\n", scan_min_match_lines); + break; + + case 260: /* --ignore-file-ext */ + scan_honor_file_extension = false; + scanlog("File extension matching disabled\n"); + break; + + case 261: /* --range-tolerance */ + scan_range_tolerance = atoi(optarg); + scan_adjust_tolerance = false; + scanlog("Range tolerance set to %d\n", scan_range_tolerance); + break; + case 'H': if (hpsm_lib_load()) hpsm_enabled = true; @@ -481,8 +546,9 @@ int main(int argc, char **argv) else { /* Init scan structure */ - if (ishash) - hash_scan(target, scan_max_snippets, scan_max_components); + if (ishash) + hash_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else { bool wfp_extension = false; @@ -494,8 +560,9 @@ int main(int argc, char **argv) if (force_bfp) bfp_extension = true; /* Scan wfp file */ - if (wfp_extension) - wfp_scan(target, scan_max_snippets, scan_max_components); + if (wfp_extension) + wfp_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else if (bfp_extension) binary_scan(target); @@ -504,12 +571,11 @@ int main(int argc, char **argv) else { scanlog("Scanning file %s\n", target); - scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); ldb_scan(scan); } } - - } /* Close main report 
structure */ diff --git a/src/match.c b/src/match.c index 6b0770f..12399eb 100644 --- a/src/match.c +++ b/src/match.c @@ -253,71 +253,167 @@ static void evaluate_path_rank(component_data_t *comp) } /** - * @brief Funtion to be called as pointer when a new compoent has to be loaded in to the list - * - * @param a existent component in the list - * @param b new component to be added - * @return true b has to be included in the list before "a" - * @return false "a" wins, compare with the next component. + * @brief Initialize component age by computing MD5 of purl and fetching age from database + * @param comp Component to initialize + */ +static inline void initialize_component_age(component_data_t *comp) +{ + if (!comp->purls_md5[0] && comp->purls[0]) + { + comp->purls_md5[0] = malloc(MD5_LEN); + if (!comp->purls_md5[0]) + { + scanlog("critical: MD5 memory allocation failed"); + return; + } + MD5((uint8_t *)comp->purls[0], strlen(comp->purls[0]), comp->purls_md5[0]); + comp->age = get_component_age(comp->purls_md5[0]); + } +} + +/** + * @brief Compare two integer values and return comparison result + * @param val_a Value from component a + * @param val_b Value from component b + * @param prefer_higher If true, higher value wins; if false, lower value wins + * @return 1 if b wins, -1 if a wins, 0 if tie + */ +static inline int compare_int_values(int val_a, int val_b, bool prefer_higher) +{ + if (val_a == val_b) + return 0; + + if (prefer_higher) + return (val_b > val_a) ? 1 : -1; + else + return (val_b < val_a) ? 
1 : -1; +} + +int compare_file_extension(component_data_t *a, component_data_t *b) +{ + if (!a->file_path_ref) + return 0; + + char *ext_file = extension(a->file_path_ref); + if (!ext_file) + return 0; + + char *ext_a = extension(a->file); + char *ext_b = extension(b->file); + + if (!ext_a && ext_b) + return 1; + + if (ext_a && !ext_b) + return -1; + + if (!ext_a && !ext_b) + return 0; + + int result_a = strcmp(ext_a, ext_file); + int result_b = strcmp(ext_b, ext_file); + + if (result_a == result_b) + return 0; + else if (!result_a) + return -1; + else if (!result_b) + return 1; + + return 0; +} + +/** + * @brief Component comparison function for determining insertion order in the component list + * + * This function implements the component selection logic using multiple hierarchical criteria: + * 1. Declared components (SBOM) evaluation + * 2. Component hints (purl and component name matching) + * 3. Path rank hint evaluation (file path similarity) + * 4. Release date validation + * 5. File extension matching + * 6. Third-party path evaluation + * 7. URL ranking and binary purl matching + * 8. Tiebreakers for equal release dates (source check, health metrics, vendor check, component age, version comparison) + * 9. Final selection based on oldest release date + * + * @param a Existing component in the list to compare against + * @param b New candidate component to be added + * @return true If component 'b' should be inserted before 'a' (b wins) + * @return false If component 'a' wins, continue comparing with the next component */ static bool component_hint_date_comparation(component_data_t *a, component_data_t *b) { + // 1. 
Declared components (SBOM) evaluation + // Prioritize components that are declared in the SBOM (Software Bill of Materials) + // identified > 0 means the component was declared/identified in the SBOM if (declared_components) { scanlog("ASSETS eval- %d / %d\n", a->identified, b->identified); - if (a->identified > b->identified) - { - scanlog("Reject component %s@%s by SBOM\n", b->purls[0], b->version); - return false; - } - - if (b->identified > a->identified) + if (a->identified != b->identified) { + // Keep component 'a' if it's identified and 'b' is not + if (a->identified > b->identified) + { + scanlog("Reject component %s@%s by SBOM\n", b->purls[0], b->version); + return false; + } + // Accept component 'b' if it's identified and 'a' is not scanlog("Accept component %s@%s by SBOM\n", b->purls[0], b->version); return true; } } - + // 2. Component hint evaluation + // Apply user-provided component hints to influence selection + // Hints can match against purl or component names else if (component_hint) { scanlog("hint eval\n"); - int result = hint_eval(a,b); - if (result > 0) - return true; - if (result < 0) - return false; + int hint_result = hint_eval(a,b); + if (hint_result != 0) + return hint_result > 0; } - + + // 3. Path rank hint evaluation + // Compare file path similarity between scanned file and component file paths + // Lower rank means better similarity (more matching path components) if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) { - //evalute path rank for component a evaluate_path_rank(a); - - //evalute path rank for component b evaluate_path_rank(b); - //The path_rank will be used as hint only when it has a reasonable value, in other cases the critea will be ignored. 
- if (b->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + const int rank_threshold = PATH_LEVEL_COMP_REF / 3 + 1; + + // Path rank is used as hint only when it has a reasonable value (below threshold) + // This prevents poor matches from being selected based on path alone + if (b->path_rank < rank_threshold) { - if (b->path_rank - a->path_rank < 0) + int rank_diff = b->path_rank - a->path_rank; + // Component 'b' has better path similarity than 'a' + if (rank_diff < 0) { scanlog("%s wins %s by path rank %d\n", b->purls[0], a->purls[0], b->path_rank); return true; } - if (b->path_rank - a->path_rank > 0) + // Component 'a' has better path similarity than 'b' + if (rank_diff > 0) { scanlog("%s - %s loses %s by path rank %d/%d\n", b->purls[0],b->file, a->purls[0], b->path_rank, a->path_rank); return false; } } - else if (a->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + // If only 'a' has a good path rank, keep it + else if (a->path_rank < rank_threshold) { scanlog("%s rejected, %s wins by path rank %d\n", b->purls[0], a->purls[0], a->path_rank); return false; } } + // 4. Release date validation + // Reject components without valid release dates + // Components must have release date information to be considered if (!*b->release_date) { scanlog("%s rejected due to empty release date\n", b->purls[0]); @@ -329,168 +425,250 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return true; } - // Third-party path evaluation + // 5. File extension matching + // Prefer components where the file extension matches the scanned file + int file_extension_comp = compare_file_extension(a, b); + if (file_extension_comp < 0) + { + // Component 'a' has matching extension, 'b' does not + scanlog("%s rejected by file extension match\n", b->purls[0]); + return false; + } + else if (file_extension_comp > 0) + { + // Component 'b' has matching extension, 'a' does not + scanlog("%s accepted by file extension mismatch\n", b->purls[0]); + return true; + } + + // 6. 
Third-party path evaluation + // Prefer components from third-party directories (vendor, external, 3rdparty, etc.) + // Higher score means more likely to be a third-party component int tp_a = path_is_third_party(a); int tp_b = path_is_third_party(b); + int tp_diff = tp_a - tp_b; - if (tp_a - tp_b > 4) + // If 'a' is significantly more third-party than 'b' (difference > 7), reject 'b' + if (tp_diff > 7) { scanlog("Component rejected by third party path filter (%s=%d=%s > %s=%d=%s)\n", a->purls[0], tp_a,a->file, b->purls[0], tp_b, b->file); return false; } - else if (tp_b - tp_a > 4) + // If 'b' is significantly more third-party than 'a' (difference < -7), accept 'b' + if (tp_diff < -7) { - scanlog("Component accepted by third party path filter (%s=%d < %s=%d)\n", a->purls[0], tp_a, b->purls[0], tp_b); + scanlog("Component accepted by third party path filter (%s=%d=%s < %s=%d=%s)\n", a->purls[0], tp_a, a->file, b->purls[0], tp_b, b->file); return true; } - //when the url ranking is enabled + // 7. URL ranking and binary purl matching + // When URL ranking is enabled (rank < COMPONENT_DEFAULT_RANK), use ranking metrics + // Lower rank values indicate higher quality/more authoritative sources if (b->rank < COMPONENT_DEFAULT_RANK || a->rank < COMPONENT_DEFAULT_RANK) - { + { + // 7.1. 
Binary file to purl matching + // Check if the component's purl matches what would be expected for a binary file bool good_purl_a = binary_file_to_purl(a); bool good_purl_b = binary_file_to_purl(b); - if (good_purl_b && !good_purl_a) - { - scanlog("Component %s prefered over %s by binary purl match\n", b->purls[0], a->purls[0]); - return true; - } - else if (good_purl_a && !good_purl_b) + + if (good_purl_b != good_purl_a) { + // Prefer component with matching binary purl + if (good_purl_b) + { + scanlog("Component %s prefered over %s by binary purl match\n", b->purls[0], a->purls[0]); + return true; + } scanlog("Component %s rejected by binary purl match\n", b->purls[0]); return false; } + // If both have good binary purls, check vendor+component match + else if (good_purl_b && good_purl_a) + { + // 7.2. Vendor component check for binary purls + // Verify if vendor and component names align with the purl + bool vendor_check_a = purl_vendor_component_check(a); + bool vendor_check_b = purl_vendor_component_check(b); + + if (vendor_check_a != vendor_check_b) + { + if (vendor_check_b) + { + scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); + return true; + } + scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); + return false; + } + } + // 7.3. Rank threshold check + // Reject components with rank above the maximum selection threshold if (b->rank >= COMPONENT_RANK_SELECTION_MAX && a->rank < COMPONENT_RANK_SELECTION_MAX) { scanlog("%s rejected by rank threshold %d >= %d\n", b->purls[0], b->rank, COMPONENT_RANK_SELECTION_MAX); return false; } - - //lower rank selection logic + + // 7.4. 
Lower rank selection logic + // For components with acceptable ranks (below max threshold), apply additional criteria if (b->rank <= COMPONENT_RANK_SELECTION_MAX) { - scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); - //shorter path lenght are prefered - if (b->path_depth < a->path_depth/2) - { - scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); - return true; - } - else if (a->path_depth < b->path_depth/2) - { - scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); - return false; - } + bool same_component = !strcmp(a->component, b->component); - if(b->path_depth > a->path_depth+1) + // 7.4.1. Same component comparison - prefer better ranked purl + // When comparing different sources of the same component, rank is decisive + if (same_component && b->rank != a->rank) { - scanlog("%s rejected by deeper path in rank selection %d > %d\n", b->purls[0], b->path_depth, a->path_depth); + if (b->rank < a->rank) + { + scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); + return true; + } + scanlog("%s rejected by rank %d\n", b->purls[0], b->rank); return false; } - if (b->rank < a->rank) + // 7.4.2. 
Path depth comparison for similar ranks + // When ranks are close (difference < 5), prefer shorter file paths + // Shorter paths often indicate more direct/canonical locations + if (abs(b->rank - a->rank) < 5) { - scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); - return true; + scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); + // Component 'b' has significantly shorter path (less than half of 'a') + if (b->path_depth + 2 < a->path_depth/2) + { + scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return true; + } + // Component 'a' has significantly shorter path (less than half of 'b') + if (a->path_depth + 2 < b->path_depth/2) + { + scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return false; + } } - else if (b->rank > a->rank) + // 7.4.3. Final rank comparison if no other criteria applied + if (b->rank != a->rank) { + if (b->rank < a->rank) + { + scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); + return true; + } scanlog("%s rejected by rank %d\n", b->purls[0], b->rank); return false; } } } - /*if the relese date is the same untie with the component age (purl)*/ + // 8. Tiebreakers for equal release dates + // When release dates are identical, use additional criteria to select the best component if (!strcmp(b->release_date, a->release_date)) - { - if (purl_source_check(a) > purl_source_check(b)) + { + // 8.1. Source check + // Prefer components from more authoritative sources (official repos, etc.) 
+ // Lower source value is better (prefer_higher = false) + int source_a = purl_source_check(a); + int source_b = purl_source_check(b); + int source_cmp = compare_int_values(source_a, source_b, false); + + if (source_cmp > 0) { scanlog("%s accepted over %s by source check\n", b->purls[0], a->purls[0]); return true; } - else if (purl_source_check(b) > purl_source_check(a)) + if (source_cmp < 0) { scanlog("%s rejected by source check\n", b->purls[0]); return false; } - //Look for available health information + // 8.2. Health information + // Prefer components from healthier projects (more forks + watchers) + // Higher health value is better (prefer_higher = true) print_health(a); print_health(b); - int health_a = a->health_stats[0] + a->health_stats[2]; //add forks and watchers + int health_a = a->health_stats[0] + a->health_stats[2]; // forks + watchers int health_b = b->health_stats[0] + b->health_stats[2]; + int health_cmp = compare_int_values(health_a, health_b, true); - - if (health_b > health_a) + if (health_cmp > 0) { scanlog("Component prefered by health: %s = %d vs %s = %d\n", b->purls[0], health_b, a->purls[0], health_a); return true; } - else if (health_a > health_b) - { + if (health_cmp < 0) return false; - } - - if (!purl_vendor_component_check(a) && purl_vendor_component_check(b)) - { - scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); - return true; - } - else if (purl_vendor_component_check(a) && !purl_vendor_component_check(b)) + // 8.3. 
Vendor component check + // Verify if vendor and component names align with the purl + bool vendor_check_a = purl_vendor_component_check(a); + bool vendor_check_b = purl_vendor_component_check(b); + + if (vendor_check_a != vendor_check_b) { + if (vendor_check_b) + { + scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); + return true; + } scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); return false; } - - if (!a->purls_md5[0] && a->purls[0]) - { - a->purls_md5[0] = malloc(MD5_LEN); - MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); - a->age = get_component_age(a->purls_md5[0]); - } - - if (!b->purls_md5[0] && b->purls[0]) - { - b->purls_md5[0] = malloc(MD5_LEN); - MD5((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); - b->age = get_component_age(b->purls_md5[0]); - } - + + // 8.4. Component age (lazy initialization) + // Prefer older components (first appearance in package repositories) + // Higher age value means the component was published earlier + initialize_component_age(a); + initialize_component_age(b); + if ((!a->age && b->age) || b->age > a->age) { scanlog("Component %s prefered over %s by purl date (age: %ld vs %ld)\n", b->purls[0], a->purls[0], b->age, a->age); return true; } - else if ((!b->age && a->age) || a->age > b->age) + if ((!b->age && a->age) || a->age > b->age) { scanlog("Component %s rejected by purl date (age: %ld vs %ld)\n", b->purls[0], b->age, a->age); return false; } - if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(a->version, b->version) > 0) - { - scanlog("Component %s prefered over %s by version\n", b->purls[0], a->purls[0]); - return true; - } - else if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(b->version, a->version) > 0) + // 8.5. 
Version comparison (only if same component and age) + // For the same component with same age, use lexicographic version comparison + // Lower version string is preferred (usually represents older/more stable versions) + if (b->age == a->age && !strcmp(a->component, b->component)) { - scanlog("Component %s rejected by version comparison\n", b->purls[0]); - return false; + int version_cmp = strcmp(a->version, b->version); + if (version_cmp > 0) + { + scanlog("Component %s prefered over %s by version\n", b->purls[0], a->purls[0]); + return true; + } + if (version_cmp < 0) + { + scanlog("Component %s rejected by version comparison\n", b->purls[0]); + return false; + } } } - /*select the oldest release date */ - if (strcmp(b->release_date, a->release_date) < 0) + // 9. Final decision: Select the oldest release date + // When no other criteria has decided, prefer the component with the earlier release date + // This implements the fundamental principle of preferring older, more established versions + int date_cmp = strcmp(b->release_date, a->release_date); + if (date_cmp < 0) { + // Component 'b' has an earlier release date (date_cmp < 0 means b->release_date < a->release_date) scanlog("Component %s (rank %d) prefered over %s (rank %d) by release date\n", b->purls[0],b->rank, a->purls[0], a->rank); return true; } - else if (strcmp(b->release_date, a->release_date) > 0) + if (date_cmp > 0) { + // Component 'a' has an earlier release date scanlog("Component %s (rank %d) rejected, %s (rank %d) wins by older release date\n", b->purls[0], b->rank, a->purls[0], a->rank); return false; } + // No criteria matched or all criteria were equal - reject component 'b' scanlog("Component %s rejected, no criteria matched\n", b->purls[0]); return false; } @@ -539,12 +717,12 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, * @return false */ /*Iterations must be doubled if high accuracy is enabled*/ -int iteration_max = FETCH_MAX_FILES; +int 
iteration_max = DEFAULT_MATCHMAP_FILES; bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { /*Iterations must be doubled if high accuracy is enabled*/ if (iteration == 0) - iteration_max = ((engine_flags & ENABLE_HIGH_ACCURACY) ? FETCH_MAX_FILES * 4 : FETCH_MAX_FILES); + iteration_max = ((engine_flags & ENABLE_HIGH_ACCURACY) ? fetch_max_files * 4 : fetch_max_files); /*Return we high accuracy it is not enabled*/ if (iteration > iteration_max) @@ -588,10 +766,8 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * */ bool load_matches(match_data_t *match) { - scanlog("Load matches\n"); - + scanlog("Loading matches - fetch_max_files: %d\n", fetch_max_files); - if (match->type == MATCH_BINARY) { asprintf(&match->line_ranges, "n/a"); @@ -933,6 +1109,11 @@ void compile_matches(scan_data_t *scan) scan->matches_list_array[0] = match_list_init(true, scan->max_snippets_to_process); scan->matches_list_array_index = 1; match_data_t *match_new = calloc(1, sizeof(match_data_t)); + if (!match_new) + { + scanlog("Error allocating memory for match data\n"); + return; + } match_new->type = scan->match_type; strcpy(match_new->source_md5, scan->source_md5); memcpy(match_new->file_md5, scan->match_ptr, MD5_LEN); diff --git a/src/match_list.c b/src/match_list.c index 9df35af..134e429 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -96,12 +96,12 @@ bool component_list_add(component_list_t *list, component_data_t *new_comp, bool { if (list->items >= list->max_items) return false; - + struct comp_entry *nn = calloc(1, sizeof(struct comp_entry)); /* Insert after. 
*/ - nn->component = new_comp; + nn->component = new_comp; LIST_INSERT_AFTER(list->last_element, nn, entries); list->last_element_aux = list->last_element; - list->last_element = nn; + list->last_element = nn; list->items++; return true; } diff --git a/src/report.c b/src/report.c index 190f8d5..fb1cc37 100644 --- a/src/report.c +++ b/src/report.c @@ -48,6 +48,7 @@ uint64_t engine_flags = 0; char kb_version[MAX_INPUT]; +static bool ranking_enabled = false; /** * @brief Open JSON report @@ -213,6 +214,15 @@ bool print_json_component(component_data_t * component) printf("{"); else printf(","); + //if the component is filtered just report the rank without extra details. + if (component->identified == IDENTIFIED_FILTERED) + { + printf("\"status\": \"filtered\""); + printf(",\"rank\": %d", component->rank); + if (engine_flags & DISABLE_BEST_MATCH) + printf("}"); + return false; + } /* Fetch related purls */ fetch_related_purls(component); @@ -254,6 +264,10 @@ bool print_json_component(component_data_t * component) printf(",%s", json_remove_invalid_char(component->license_text)); } + if (ranking_enabled) + printf(",\"rank\": %d", component->rank); + + if (!(engine_flags & DISABLE_HEALTH)) { if (!component->health_text) @@ -308,6 +322,7 @@ bool print_json_component(component_data_t * component) bool print_json_match(struct match_data_t * match) { + if (!match->component_list.headp.lh_first) { scanlog("Match with no components ignored: %s", match->source_md5); @@ -318,7 +333,13 @@ bool print_json_match(struct match_data_t * match) if (engine_flags & DISABLE_BEST_MATCH) printf("{"); - printf("\"id\": \"%s\"", matchtypes[match->type]); + if (match->scan_ower->component_ranking_threshold >= 0) + ranking_enabled = true; + + printf("\"id\": \"%s\"", matchtypes[match->type]); + if (!match->scan_ower->snippet_adjust_tolerance && match->type == MATCH_SNIPPET) + printf(",\"hits\": %d", match->hits); + printf(",\"lines\": \"%s\"", match->line_ranges); printf(",\"oss_lines\": \"%s\"", 
match->oss_ranges); printf(",\"matched\": \"%d%%\"", match->matched_percent); diff --git a/src/scan.c b/src/scan.c index 84401fa..33c85ec 100644 --- a/src/scan.c +++ b/src/scan.c @@ -53,7 +53,7 @@ char *ignored_assets = NULL; @param target File to scan @return Scan data */ -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { scanlog("Scan Init\n"); scan_data_t * scan = calloc(1, sizeof(*scan)); @@ -62,8 +62,13 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) scan->hashes = calloc(MAX_FILE_SIZE,1); scan->lines = malloc(MAX_FILE_SIZE); scan->match_type = MATCH_NONE; - + scan->component_ranking_threshold = component_ranking_threshold; + scan->snippet_adjust_tolerance = adjust_tolerance; + scan->snippet_min_hits = snippet_min_hits; + scan->snippet_min_lines = snippet_min_lines; + scan->snippet_honor_file_extension = snippet_honor_file_extension; scan->max_components_to_process = max_components; + scan->snippet_range_tolerance = snippet_range_tolerance > 0 ? snippet_range_tolerance : 1; scan->max_snippets_to_process = max_snippets > MAX_MULTIPLE_COMPONENTS ? MAX_MULTIPLE_COMPONENTS : max_snippets; scan->max_snippets_to_process = scan->max_snippets_to_process == 0 ? 
1 : scan->max_snippets_to_process; @@ -189,9 +194,9 @@ int asset_declared(component_data_t * comp) * @param scan Scan data * @return Scan result (SUCCESS/FAILURE) |**/ -int hash_scan(char *path, int scan_max_snippets, int scan_max_components) +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_range_tolerance, snippet_honor_file_extension); scan->preload = true; /* Get file MD5 */ @@ -216,7 +221,7 @@ int hash_scan(char *path, int scan_max_snippets, int scan_max_components) * @param scan_max_components Limit for component to be displayed. 1 by default. 
* @return EXIT_SUCCESS */ -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { scan_data_t * scan = NULL; char * line = NULL; @@ -303,7 +308,7 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) } /*Init a new scan object for the next file to be scanned */ - scan = scan_data_init(target, scan_max_snippets, scan_max_components); + scan = scan_data_init(target, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_range_tolerance, snippet_honor_file_extension); strcpy(scan->source_md5, tmp_md5_hex); extract_csv(scan->file_size, (char *)rec, 1, LDB_MAX_REC_LN); scan->preload = true; @@ -449,14 +454,17 @@ void ldb_scan(scan_data_t *scan) exit(EXIT_FAILURE); } - // Clean up the log file - if (debug_on) - scanlog_init(); - scan->matchmap_size = 0; scan->match_type = MATCH_NONE; scan->timer = microseconds_now(); + if (scan->component_ranking_threshold < 0) + component_rank_max = -1; // disable ranking + else if (scan->component_ranking_threshold == 0) + component_rank_max = COMPONENT_DEFAULT_RANK + 1; // all accepted + else + component_rank_max = scan->component_ranking_threshold; + /* Get file length */ uint64_t file_size = 0; diff --git a/src/snippet_selection.c b/src/snippet_selection.c index 0d29172..f3cd640 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -121,26 +121,30 @@ void biggest_snippet(scan_data_t *scan) if (j < 0) continue; - if (scan->matchmap[j].hits >= min_match_hits) /* Only consider file with more than min_match_hits */ + if (scan->matchmap[j].hits >= scan->snippet_min_hits) /* Only consider files with at least snippet_min_hits hits */ { match_data_t *match_new = calloc(1,
sizeof(match_data_t)); /* Create a match object */ + if (!match_new) + { + scanlog("Error allocating memory for match data\n"); + return; + } memcpy(match_new->file_md5, scan->matchmap[j].md5, oss_file.key_ln); match_new->hits = scan->matchmap[j].hits; match_new->matchmap_reg = &scan->matchmap[j]; match_new->type = scan->match_type; - match_new->from = scan->matchmap[j].range->from; strcpy(match_new->source_md5, scan->source_md5); match_new->scan_ower = scan; int i = 0; - if (snippet_extension_discard(match_new)) + if (scan->snippet_honor_file_extension && snippet_extension_discard(match_new)) { match_data_free(match_new); continue; } int matched_lines = compile_ranges(match_new); - if (matched_lines < min_match_lines) { + if (matched_lines < scan->snippet_min_lines) { match_data_free(match_new); continue; } @@ -243,11 +247,11 @@ void add_snippet_ids(match_data_t *match, char *snippet_ids, long from, long to) * @param scan[out] pointer to scan data * @return hits */ -int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges) +int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_range_lines, int ranges_number) { int out = 0; /* Walk ranges */ - for (int i = 0; i < MATCHMAP_RANGES; i++) + for (int i = 0; i < ranges_number; i++) { int to = ranges[i].to; int from = ranges[i].from; @@ -257,14 +261,14 @@ int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges) { if (from == 0) from = 1; + //discard snippets below the limit of detection + if (to - from < min_range_lines) + continue; /* Add commas unless it is the first range */ if (*line_ranges) strcat(line_ranges, ","); if (*oss_ranges) strcat(oss_ranges, ","); - //discard snippets below the limit of detection - if (to - from < min_match_lines) - continue; /* Add from-to values */ sprintf(line_ranges + strlen(line_ranges), "%d-%d", from, to); @@ -291,9 +295,15 @@ int range_comp(const void *a, const void *b) * @brief Join overlapping ranges 
* @param ranges ranges list to process */ -matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) +matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance, bool dynamic_ranges) { - matchmap_range *out_ranges = malloc(sizeof(matchmap_range) * MATCHMAP_RANGES); + int out_size = MATCHMAP_RANGES; + if (dynamic_ranges) + out_size = size; + + matchmap_range *out_ranges = calloc(out_size, sizeof(matchmap_range)); + if (!out_ranges) + return NULL; int processed = 0; int tolerance = range_tolerance > 0 ? range_tolerance : 1; @@ -301,8 +311,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) { int out_ranges_index = -1; processed = 0; - out_ranges[0] = ranges[0]; - memset(out_ranges, 0, sizeof(matchmap_range) * MATCHMAP_RANGES); + memset(out_ranges, 0, sizeof(matchmap_range) * out_size); scanlog("Range tolerance: %d\n", tolerance); for (int i = 0; i < size; i++) { @@ -319,7 +328,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) else { out_ranges_index++; - if (out_ranges_index == MATCHMAP_RANGES) + if (out_ranges_index == MATCHMAP_RANGES && !dynamic_ranges) break; out_ranges[out_ranges_index].from = ranges[i].from; out_ranges[out_ranges_index].to = ranges[i].to; @@ -328,6 +337,8 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) processed++; } } + if (dynamic_ranges) + break; tolerance *= 2; } @@ -352,7 +363,11 @@ void ranges_sort(matchmap_range *ranges, int size) */ uint32_t compile_ranges(match_data_t *match) { - + if (match->matchmap_reg->ranges_number <= 0) + { + scanlog("No ranges to compile\n"); + return 0; + } char line_ranges[MAX_FIELD_LN * 2] = "\0"; char oss_ranges[MAX_FIELD_LN * 2] = "\0"; char snippet_ids[MAX_SNIPPET_IDS_RETURNED * WFP_LN * 2 + MATCHMAP_RANGES + 1] = "\0"; @@ -368,20 +383,23 @@ uint32_t compile_ranges(match_data_t *match) if (debug_on) { - scanlog("Accepted ranges (min lines range = %d):\n", 
min_match_lines); + scanlog("Accepted ranges (min lines range = %d):\n", match->scan_ower->snippet_min_lines); for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) { if ( match->matchmap_reg->range[i].from && match->matchmap_reg->range[i].to) - scanlog(" %d = %ld to %ld - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, + scanlog(" %d = %u to %u - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, match->matchmap_reg->range[i].oss_line); } } - - matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number); - + //TODO: Re-enable dynamic ranges when feature is complete + // For now, we force adjust_tolerance to ensure stable behavior + match->scan_ower->snippet_adjust_tolerance = true; + scanlog("Snippet adjust tolerance flag is being ignored\n"); + matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance, !match->scan_ower->snippet_adjust_tolerance); + int ranges_number = !match->scan_ower->snippet_adjust_tolerance ? 
match->matchmap_reg->ranges_number : MATCHMAP_RANGES; if (engine_flags & ENABLE_SNIPPET_IDS) { - for (int range = 0; range < MATCHMAP_RANGES; range++) + for (int range = 0; range < ranges_number; range++) { if (!ranges[range].from && !ranges[range].to) break; @@ -393,13 +411,13 @@ uint32_t compile_ranges(match_data_t *match) if (debug_on) { scanlog("Final ranges:\n"); - for (uint32_t i = 0; i < MATCHMAP_RANGES; i++) + for (uint32_t i = 0; i < ranges_number; i++) { if ( ranges[i].from && ranges[i].to) - scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); + scanlog(" %d = %u to %u - OSS from: %u\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } } - hits = ranges_assemble(ranges, line_ranges, oss_ranges); + hits = ranges_assemble(ranges, line_ranges, oss_ranges, match->scan_ower->snippet_min_lines, ranges_number); match->line_ranges = strdup(line_ranges); match->oss_ranges = strdup(oss_ranges); match->snippet_ids = strdup(snippet_ids); diff --git a/src/snippets.c b/src/snippets.c index c25c366..7c5f9ab 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -95,11 +95,7 @@ bool skip_snippets(char *src, uint64_t srcln) scanlog("Skipping snippets: Binary file\n"); return true; // is binary } - /*if (unwanted_header(src)) - { - scanlog("Skipping snippets: Ignored contents\n"); - return true; - }*/ + return false; } @@ -109,20 +105,13 @@ bool skip_snippets(char *src, uint64_t srcln) */ static void adjust_tolerance(scan_data_t *scan) { - bool skip = false; uint32_t wfpcount = scan->hash_count; + int range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; /** A maximum number of non-matched lines tolerated inside a matching range */ + int min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; /** Minimum number of lines matched for a match range to be acepted */ + int min_match_hits = SNIPPETS_DEFAULT_MIN_MATCH_HITS; /** Minimum number of snippet ID hits to produce a snippet match*/ - if (!wfpcount) - skip = true; - else if 
(scan->lines[wfpcount - 1] < 10) - skip = true; - - if (skip) - { - min_match_lines = 5; - min_match_hits = 2; - } - else + + if (wfpcount && scan->lines[wfpcount - 1] > SNIPPETS_DEFAULT_MIN_MATCH_LINES * 2) { /* Range tolerance is the maximum amount of non-matched lines accepted within a matched range. This goes from 21 in small files to 5 in large files */ @@ -142,7 +131,9 @@ static void adjust_tolerance(scan_data_t *scan) if (min_match_hits > 9) min_match_hits = 9; } - + scan->snippet_min_hits = min_match_hits; + scan->snippet_min_lines = min_match_lines; + scan->snippet_range_tolerance = range_tolerance; scanlog("Match hits: %d, Tolerance: range=%d, lines=%d, wfpcount=%u\n", min_match_hits, range_tolerance, min_match_lines, wfpcount); } @@ -278,7 +269,7 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 } /* Increase range */ - else if (gap < range_tolerance) + else if (gap < scan->snippet_range_tolerance) { range_found = true; /* Update range start (from) */ @@ -338,7 +329,8 @@ match_t ldb_scan_snippets(scan_data_t *scan) return MATCH_NONE; matchmap_setup(scan); - adjust_tolerance(scan); + if (scan->snippet_adjust_tolerance) + adjust_tolerance(scan); /* First build a map with all the MD5s related with each WFP from the source file*/ matchmap_entry_t map[scan->hash_count]; @@ -381,7 +373,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) memset(map_indirection_index, 0, sizeof(map_indirection_index)); scanlog ("< Snippet scan setup: Total lines: %d ,Matchmap size: %d, Min hits: %d, Min lines: %d, Map max size = %d, Cat N = %d x %d, Cat size = %d >\n", - scan->total_lines, scan->max_matchmap_size, min_match_hits, min_match_lines, map_max_size, MAP_INDIRECTION_CAT_NUMBER, map_indedirection_items_size, MAP_INDIRECTION_CAT_SIZE); + scan->total_lines, scan->max_matchmap_size, scan->snippet_min_hits, scan->snippet_min_lines, map_max_size, MAP_INDIRECTION_CAT_NUMBER, map_indedirection_items_size, MAP_INDIRECTION_CAT_SIZE); for (int i 
=0; i < scan->hash_count; i++) { @@ -484,7 +476,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) for (int sector = 0; sector < 256; sector++) { scan->matchmap_rank_by_sector[sector] = -1; - int sector_max = min_match_hits; + int sector_max = scan->snippet_min_hits; for (int cat = 0; cat < cat_limit_index; cat++) { /* travel the cathegories map*/ @@ -555,7 +547,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) { int wfp_p = wfp_index * WFP_REC_LN; int sector = md5s[wfp_p]; - int sector_max = min_match_hits; + int sector_max = scan->snippet_min_hits; if (scan->matchmap_rank_by_sector[sector] < 0) continue; diff --git a/src/url.c b/src/url.c index 027cd88..cbb21c1 100644 --- a/src/url.c +++ b/src/url.c @@ -51,6 +51,8 @@ * @param ptr //TODO * @return //TODO */ +int component_rank_max = COMPONENT_DEFAULT_RANK + 1; /*Used defined max component rank accepted*/ + bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { if (!datalen && datalen >= MAX_PATH) return false; @@ -71,23 +73,31 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra component_data_t * new_comp = calloc(1, sizeof(*new_comp)); bool result = fill_component(new_comp, NULL, NULL, (uint8_t*) data); scanlog("URL MATCH: %s\n", data); - if (result && new_comp->rank <= component_rank_max) + if (!result) + { + component_data_free(new_comp); + } + else { /* Save match component id */ memcpy(new_comp->url_md5, key, LDB_KEY_LN); memcpy(new_comp->url_md5 + LDB_KEY_LN, subkey, subkey_ln); new_comp->url_match = true; - new_comp->file = strdup(new_comp->url); + char * file_name = strdup(new_comp->url); + new_comp->file = strdup(basename(file_name)); + free(file_name); new_comp->file_md5_ref = component_list->match_ref->file_md5; new_comp->identified = IDENTIFIED_NONE; asset_declared(new_comp); + if (component_rank_max > 0 && new_comp->rank > component_rank_max) + { + scanlog("Setting component with rank %d as 
filtered\n", new_comp->rank); + new_comp->identified = IDENTIFIED_FILTERED; + } + component_list_add(component_list, new_comp, component_date_comparation, true); } - else - component_data_free(new_comp); - free(data); - return false; } @@ -328,6 +338,8 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, if (!url) return false; + //scanlog("url: %s\n", url); + /* Get oldest */ component_data_t **comp_address = ptr; component_data_t * comp_oldest = *comp_address; @@ -336,8 +348,9 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, { component_data_t * comp = calloc(1, sizeof(*comp)); bool result = fill_component(comp, key, NULL, (uint8_t *)url); - if (!result || comp->rank > component_rank_max) + if (!result) { + scanlog("ignoring component with rank %d\n", comp->rank); free(url); component_data_free(comp); return false; @@ -346,6 +359,13 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, asset_declared(comp); purl_latest_version_add(comp); + if (component_rank_max > 0 && comp->rank > component_rank_max) + { + scanlog("Setting component with rank %d as filtered\n", comp->rank); + comp->identified = IDENTIFIED_FILTERED; + } + + if (!comp_oldest) { *comp_address = comp; diff --git a/src/util.c b/src/util.c index 8033fe5..93cdc21 100644 --- a/src/util.c +++ b/src/util.c @@ -339,8 +339,8 @@ int path_is_third_party(component_data_t *comp) if (!comp->file) return 0; - - char * path = comp->file; + char * full_path = strdup(comp->file); + char * path = dirname(full_path); const char* patterns[] = { // Explicit third-party naming @@ -360,9 +360,8 @@ int path_is_third_party(component_data_t *comp) // Build/dependency management directories "external", // Maven, CMake external dependencies - "externals", // Alternative "dependencies", // Generic dependency directories - "dep", // Short form + "deps", // Short form "packages", // NuGet, Generic (covers packages.lock) // 
Language-specific package directories @@ -379,33 +378,37 @@ int path_is_third_party(component_data_t *comp) "imported", // Imported code "foreign", // Foreign code - // Build output that may contain third-party - "dist", // Distribution builds - "release", // Release builds - "bundle", // Bundled dependencies - // Contribution/extension directories "contrib", // Contributed/third-party code "plugin", // Plugins (often third-party) - "utils","lib", "components", "modules", "ext", - "fixtures", "examples", - "files", "assets", "runtime", + "utils", "components", "modules", "ext", + "fixtures", "examples","assets", "runtime", "subprojects", "managed", "local_packages", "published", - "driver", "libresources", "offloading","documentation", "test" + "libresources", "offloading", "compile", "release", "bundle", + "media", "documentation", "test", + "service","lib","dist", + "driver", "common","files" }; const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); + if (!strcmp(path, ".")) + { + free(full_path); + return numPatterns; + } + for (int i = 0; i < numPatterns; i++) { if (strcasestr(path, patterns[i]) != NULL) { - return i; + free(full_path); + return i; } } - - return numPatterns + 1; + free(full_path); + return numPatterns; } /** diff --git a/src/versions.c b/src/versions.c index e6f4526..c6bd7fe 100644 --- a/src/versions.c +++ b/src/versions.c @@ -45,13 +45,13 @@ #include "versions.h" -static char * purl_indirection_reference[FETCH_MAX_FILES]; +static char * purl_indirection_reference[FETCH_MAX_FILES_DEFAULT]; static int purl_indirection_index = 0; -static release_version * purl_version_list[FETCH_MAX_FILES]; +static release_version * purl_version_list[FETCH_MAX_FILES_DEFAULT]; void purl_latest_version_add(component_data_t * component) { - if (!component->purls[0] || !component->release_date || !component->version || purl_indirection_index == FETCH_MAX_FILES) + if (!component->purls[0] || !component->release_date || !component->version || 
purl_indirection_index == FETCH_MAX_FILES_DEFAULT) return; for (int i = 0; i < purl_indirection_index; i++) diff --git a/src/vulnerability.c b/src/vulnerability.c index 3765014..5673023 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -40,6 +40,7 @@ #include "versions.h" /** @brief //TODO */ const char *vulnerability_sources[] = {"nvd", "github_advisories"}; +const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ /**