From 8b7abe8bbf58169bd424e5b65e5440bd9cc9adac Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Thu, 27 Nov 2025 13:48:50 +0000 Subject: [PATCH 01/20] fix bug processing third party hint --- src/util.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/util.c b/src/util.c index 8033fe5..00b03e7 100644 --- a/src/util.c +++ b/src/util.c @@ -340,7 +340,7 @@ int path_is_third_party(component_data_t *comp) if (!comp->file) return 0; - char * path = comp->file; + char * path = dirname(comp->file); const char* patterns[] = { // Explicit third-party naming @@ -397,6 +397,9 @@ int path_is_third_party(component_data_t *comp) const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); + if (!strcmp(path, comp->file)) + return numPatterns; + for (int i = 0; i < numPatterns; i++) { if (strcasestr(path, patterns[i]) != NULL) From b60488eee2527dd624451e37ed81e071735722d3 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Thu, 27 Nov 2025 15:35:17 +0000 Subject: [PATCH 02/20] add long name parameters, update help --- src/help.c | 54 +++++++++++++++++++++++++++++++----------------------- src/main.c | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/src/help.c b/src/help.c index bb90b46..a42dca1 100644 --- a/src/help.c +++ b/src/help.c @@ -32,6 +32,8 @@ #include "help.h" #include "scanoss.h" #include "limits.h" +#include "match_list.h" +#include "component.h" /** * @brief Print the help @@ -46,34 +48,37 @@ Results are displayed in JSON format through STDOUT.\n\ Syntax: scanoss [parameters] [TARGET]\n\ \n\ Configuration:\n\ --w Process TARGET as a .wfp file, regardless of its actual extension.\n\ --H Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ --e Match only files with identical extensions as the scanned file (default: off).\n\ --M NUMBER Search for up to NUMBER different components in each file (maximum: 9).\n\ --T NUMBER Set snippet scanning tolerance percentage (default: 0.1).\n\ --s SBOM Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ --b SBOM Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ --B SBOM Same as \"-b\" but with forced snippet scanning.\n\ --a SBOM Show attribution notices for the provided SBOM.json file.\n\ --c HINT Add a component HINT to guide scan results.\n\ --k KEY Show contents of the specified KEY file from MZ sources archive.\n\ --l LICENSE Display OSADL metadata for the given SPDX license ID.\n\ --L Enable license full reort.\n\ +-w, --wfp Process TARGET as a .wfp file, regardless of its actual extension.\n\ +-H, --hpsm Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ +-e, --extension Match only files with identical extensions as the scanned file (default: off).\n\ +-M, --max-snippets NUM Search for up to NUM different components in each file (maximum: 9).\n\ +-N, --max-components NUM Set maximum number of components (default: %d).\n\ +-T, --tolerance NUM Set snippet scanning tolerance percentage (default: 0.1).\n\ +-r, --rank NUM Set maximum component rank accepted (default: %d).\n\ +-s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ +-b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ + --force-snippet FILE Force snippet scanning even for full file matches.\n\ +-a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\ +-c, --component HINT Add a component HINT to guide scan results.\n\ +-k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\ +-l, --license LICENSE Display OSADL metadata for the given SPDX license ID.\n\ +-L, --full-license Enable full license report.\n\ +-F, --flags FLAGS Set engine scanning flags (see below).\n\ \n\ Options:\n\ --t Run engine performance tests.\n\ --v Show version information and exit.\n\ --n Set database name (default: oss).\n\ --h Display this help information and exit.\n\ --d Store debugging information to disk (/tmp).\n\ --q Suppress JSON output (show only debugging info via STDERR).\n\ +-t, --test Run engine performance tests.\n\ +-v, --version Show version information and exit.\n\ +-n, --name NAME Set database name (default: oss).\n\ +-h, --help Display this help information and exit.\n\ +-d, --debug Store debugging information to disk (/tmp).\n\ +-q, --quiet Suppress JSON output (show only debugging info via STDERR).\n\ \n\ Environment variables:\n\ SCANOSS_MATCHMAP_MAX: Set the snippet scanning match map size (default: %d).\n\ -SCANOSS_FILE_CONTENTS_URL: Define the API URL endpoint for sources. Source url wont be reported if it's not defined.\n\ +SCANOSS_FILE_CONTENTS_URL: Define the API URL endpoint for sources. Source URL won't be reported if not defined.\n\ \n\ Engine scanning flags:\n\ -Configure the scanning engine using flags with the -F parameter.\n\ +Configure the scanning engine using flags with the -F/--flags parameter.\n\ These settings can also be specified in %s\n\ +-------+-------------------------------------------------------+\n\ | Flag | Setting |\n\ @@ -94,7 +99,10 @@ These settings can also be specified in %s\n\ | 8192 | Disable health layer (default: enabled) |\n\ | 16384 | Enable high accuracy, slower scan (default: disabled) |\n\ +-------+-------------------------------------------------------+\n\ -Example: scanoss -F 12 DIRECTORY (scan DIRECTORY without license and dependency data)\n\ +Examples:\n\ + scanoss -F 12 DIRECTORY Scan DIRECTORY without license and dependency data\n\ + scanoss --flags 12 DIRECTORY Same as above using long option\n\ + scanoss --sbom my_sbom.json TARGET Scan TARGET including SBOM assets\n\ \n\ -Copyright (C) 2018-2022 SCANOSS.COM\n", DEFAULT_MATCHMAP_FILES, ENGINE_FLAGS_FILE); +Copyright (C) 2018-2022 SCANOSS.COM\n", SCAN_MAX_COMPONENTS_DEFAULT, COMPONENT_DEFAULT_RANK + 1, DEFAULT_MATCHMAP_FILES, ENGINE_FLAGS_FILE); } diff --git a/src/main.c b/src/main.c index b163889..b88bf90 100644 --- a/src/main.c +++ b/src/main.c @@ -46,6 +46,7 @@ #include #include "hpsm.h" #include +#include struct ldb_table oss_url; struct ldb_table oss_file; @@ -263,6 +264,34 @@ uint64_t read_flags() int component_rank_max = COMPONENT_DEFAULT_RANK + 1; /*Used defined max component rank accepted*/ + +/* Long options structure for getopt_long */ +static struct option long_options[] = { + {"rank", required_argument, 0, 'r'}, + {"tolerance", required_argument, 0, 'T'}, + {"sbom", required_argument, 0, 's'}, + {"blacklist", required_argument, 0, 'b'}, + {"force-snippet", required_argument, 0, 256}, /* Long option only, no short form */ + {"component", required_argument, 0, 'c'}, + {"key", required_argument, 0, 'k'}, + {"attribution", required_argument, 0, 'a'}, + {"flags", required_argument, 0, 'F'}, + {"license", required_argument, 0, 'l'}, + {"full-license", no_argument, 0, 'L'}, + {"name", required_argument, 0, 'n'}, + {"max-snippets", required_argument, 0, 'M'}, + {"max-components", required_argument, 0, 'N'}, + {"wfp", no_argument, 0, 'w'}, + {"test", no_argument, 0, 't'}, + {"version", no_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"extension", no_argument, 0, 'e'}, + {"debug", no_argument, 0, 'd'}, + {"quiet", no_argument, 0, 'q'}, + {"hpsm", no_argument, 0, 'H'}, + {0, 0, 0, 0} +}; + /** * @brief //TODO * @param argc //TODO @@ -291,9 +320,10 @@ int main(int argc, char **argv) /* Parse arguments */ int option; + int option_index = 0; bool invalid_argument = false; char * ldb_db_name = NULL; - while ((option = getopt(argc, argv, ":r:T:s:b:B:c:k:a:F:l:n:M:N:wtLvhedqH")) != -1) + while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhedqH", long_options, &option_index)) != -1) { /* Check valid alpha is entered */ if (optarg) @@ -368,8 +398,7 @@ int main(int argc, char **argv) case 'w': force_wfp = true; break; - case 'B': - ignore_components = get_components(optarg); + case 256: /* --force-snippet (long option only) */ force_snippet_scan = true; break; case 't': From b7f3c79a480b606fc177fc44c9dc9464a6a2e71e Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Fri, 28 Nov 2025 10:11:16 +0000 Subject: [PATCH 03/20] add engine extra configuration parameters, small code refactor --- inc/component.h | 35 +++++++++++++++++++++++++++++++++++ inc/limits.h | 6 ++++++ inc/parse.h | 1 + inc/scanoss.h | 40 ---------------------------------------- src/file.c | 6 +++--- src/help.c | 5 ++++- src/limits.c | 1 + src/main.c | 17 ++++++++++++++++- src/match.c | 8 +++----- src/versions.c | 6 +++--- 10 files changed, 72 insertions(+), 53 deletions(-) diff --git a/inc/component.h b/inc/component.h index 5c73f46..9616e48 100644 --- a/inc/component.h +++ b/inc/component.h @@ -2,6 +2,7 @@ #define __COMPONENT_H #include "scanoss.h" +#include "limits.h" #define COMPONENT_DEFAULT_RANK 999 //default rank for components without rank information #define COMPONENT_RANK_SELECTION_MAX 8 //max rank to be considered in component selection @@ -62,6 +63,40 @@ typedef struct component_data_t int third_party_rank; /* Saves third party ranking*/ } component_data_t; +typedef struct keywords +{ + int count; + char word[MAX_FIELD_LN]; +} keywords; + + +typedef struct file_recordset +{ + uint8_t url_id[MD5_LEN]; + char path[MAX_FILE_PATH]; + int path_ln; + bool external; +} file_recordset; + +typedef struct len_rank +{ + int id; + int len; +} len_rank; + +typedef struct component_item +{ + char * vendor; + char * component; + char * purl; + char * version; + char * license; +} component_item; + +extern component_item *ignore_components; +extern component_item *declared_components; + + component_data_t * component_init(void); void component_data_free(component_data_t * data); bool fill_component(component_data_t * component, uint8_t *url_key, char *file_path, uint8_t *url_record); diff --git a/inc/limits.h b/inc/limits.h index 69122a4..ddb8c7c 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -34,6 +34,11 @@ #define MAX_QUERY_RESPONSE (1024 * 1024 * 8) #define SLOW_QUERY_LIMIT_IN_USEC 2000000 #define MAX_JSON_VALUE_LEN 4096 +#define MAX_FILE_PATH 1024 +#define FETCH_MAX_FILES_DEFAULT 12000 +#define MIN_FILE_SIZE 256 // files below this size will be ignored +#define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates +#define SNIPPET_LINE_TOLERANCE 10 /* Snippets */ #define DEFAULT_MATCHMAP_FILES 10000 // Default number of files evaluated in snippet matching @@ -54,5 +59,6 @@ extern int consecutive_threshold; extern int range_tolerance; // A maximum number of non-matched lines tolerated inside a matching range extern int min_match_lines; // Minimum number of lines matched for a match range to be acepted extern int min_match_hits; // Minimum number of snippet ID hits to produce a snippet match +extern int fetch_max_files; // Maximum number of files to fetch during component matching #endif diff --git a/inc/parse.h b/inc/parse.h index 27e3984..1a367dc 100644 --- a/inc/parse.h +++ b/inc/parse.h @@ -4,6 +4,7 @@ #include #include #include "scanoss.h" +#include "component.h" void extract_csv(char *out, char *in, int n, long limit); void lowercase(char *word); diff --git a/inc/scanoss.h b/inc/scanoss.h index e32c313..16de422 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -30,12 +30,6 @@ #include #include "limits.h" -#define MAX_FILE_PATH 1024 -#define FETCH_MAX_FILES 12000 -#define MIN_FILE_SIZE 256 // files below this size will be ignored -#define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates -#define SNIPPET_LINE_TOLERANCE 10 - #define WFP_LN 4 #define WFP_REC_LN 18 @@ -84,37 +78,6 @@ extern const char *dependency_sources[]; typedef enum {MATCH_NONE, MATCH_FILE, MATCH_SNIPPET, MATCH_BINARY} match_t; -typedef struct keywords -{ - int count; - char word[MAX_FIELD_LN]; -} keywords; - - -typedef struct file_recordset -{ - uint8_t url_id[MD5_LEN]; - char path[MAX_FILE_PATH]; - int path_ln; - bool external; -} file_recordset; - -typedef struct len_rank -{ - int id; - int len; -} len_rank; - -typedef struct component_item -{ - char * vendor; - char * component; - char * purl; - char * version; - char * license; -} component_item; - - extern long microseconds_start; extern int map_rec_len; extern bool match_extensions; @@ -144,9 +107,6 @@ extern bool first_file; extern int max_vulnerabilities; extern char *ignored_assets; -extern component_item *ignore_components; -extern component_item *declared_components; - /* Prototype declarations */ diff --git a/src/file.c b/src/file.c index 204915e..d56f9fe 100644 --- a/src/file.c +++ b/src/file.c @@ -192,8 +192,8 @@ int dir_count(char *path) bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { - /* Leave if FETCH_MAX_FILES is reached */ - if (iteration >= FETCH_MAX_FILES) return true; + /* Leave if fetch_max_files is reached */ + if (iteration >= fetch_max_files) return true; /* Ignore path lengths over the limit */ if (!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; @@ -231,7 +231,7 @@ bool count_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_ int * count = ptr; *count = iteration; - if (iteration >= FETCH_MAX_FILES) + if (iteration >= fetch_max_files) { return true; } diff --git a/src/help.c b/src/help.c index a42dca1..baa9331 100644 --- a/src/help.c +++ b/src/help.c @@ -55,9 +55,12 @@ Configuration:\n\ -N, --max-components NUM Set maximum number of components (default: %d).\n\ -T, --tolerance NUM Set snippet scanning tolerance percentage (default: 0.1).\n\ -r, --rank NUM Set maximum component rank accepted (default: %d).\n\ + --max-files NUM Set maximum number of files to fetch during matching (default: 12000).\n\ + --min-match-hits NUM Set minimum snippet ID hits for a match (default: 4).\n\ + --min-match-lines NUM Set minimum matched lines for a range (default: 10).\n\ -s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ -b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ - --force-snippet FILE Force snippet scanning even for full file matches.\n\ + --force-snippet FILE Same as \"-b\" but with forced snippet scanning.\n\ -a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\ -c, --component HINT Add a component HINT to guide scan results.\n\ -k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\ diff --git a/src/limits.c b/src/limits.c index 8471794..4f9772b 100644 --- a/src/limits.c +++ b/src/limits.c @@ -12,5 +12,6 @@ int range_tolerance = 5; /** A maximum number of non-matched lines tolerated inside a matching range */ int min_match_lines = 10; /** Minimum number of lines matched for a match range to be acepted */ int min_match_hits = 4; /** Minimum number of snippet ID hits to produce a snippet match*/ +int fetch_max_files = 12000; /** Maximum number of files to fetch during component matching */ const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ \ No newline at end of file diff --git a/src/main.c b/src/main.c index b88bf90..32ccbb8 100644 --- a/src/main.c +++ b/src/main.c @@ -281,6 +281,9 @@ static struct option long_options[] = { {"name", required_argument, 0, 'n'}, {"max-snippets", required_argument, 0, 'M'}, {"max-components", required_argument, 0, 'N'}, + {"max-files", required_argument, 0, 257}, /* Long option only */ + {"min-match-hits", required_argument, 0, 258}, /* Long option only */ + {"min-match-lines", required_argument, 0, 259}, /* Long option only */ {"wfp", no_argument, 0, 'w'}, {"test", no_argument, 0, 't'}, {"version", no_argument, 0, 'v'}, @@ -443,7 +446,19 @@ int main(int argc, char **argv) printf("Unsupported option: %c\n", optopt); invalid_argument = true; break; - + + case 257: /* --max-files */ + fetch_max_files = atoi(optarg); + break; + + case 258: /* --min-match-hits */ + min_match_hits = atoi(optarg); + break; + + case 259: /* --min-match-lines */ + min_match_lines = atoi(optarg); + break; + case 'H': if (hpsm_lib_load()) hpsm_enabled = true; diff --git a/src/match.c b/src/match.c index 6b0770f..9eb6b49 100644 --- a/src/match.c +++ b/src/match.c @@ -539,12 +539,12 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, * @return false */ /*Iterations must be doubled if high accuracy is enabled*/ -int iteration_max = FETCH_MAX_FILES; +int iteration_max = DEFAULT_MATCHMAP_FILES; bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { /*Iterations must be doubled if high accuracy is enabled*/ if (iteration == 0) - iteration_max = ((engine_flags & ENABLE_HIGH_ACCURACY) ? FETCH_MAX_FILES * 4 : FETCH_MAX_FILES); + iteration_max = ((engine_flags & ENABLE_HIGH_ACCURACY) ? fetch_max_files * 4 : fetch_max_files); /*Return we high accuracy it is not enabled*/ if (iteration > iteration_max) @@ -588,10 +588,8 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * */ bool load_matches(match_data_t *match) { - scanlog("Load matches\n"); + scanlog("Loading matches - fetch_max_files: %d\n", fetch_max_files); - - if (match->type == MATCH_BINARY) { asprintf(&match->line_ranges, "n/a"); diff --git a/src/versions.c b/src/versions.c index e6f4526..f97ec2c 100644 --- a/src/versions.c +++ b/src/versions.c @@ -45,13 +45,13 @@ #include "versions.h" -static char * purl_indirection_reference[FETCH_MAX_FILES]; +static char * purl_indirection_reference[FETCH_MAX_FILES_DEFAULT]; static int purl_indirection_index = 0; -static release_version * purl_version_list[FETCH_MAX_FILES]; +static release_version * purl_version_list[FETCH_MAX_FILES_DEFAULT]; void purl_latest_version_add(component_data_t * component) { - if (!component->purls[0] || !component->release_date || !component->version || purl_indirection_index == FETCH_MAX_FILES) + if (!component->purls[0] || !component->release_date || !component->version || purl_indirection_index == fetch_max_files) return; for (int i = 0; i < purl_indirection_index; i++) From f2b6e68b494a8f82f129242c7f1731a63681f973 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Thu, 4 Dec 2025 09:55:28 +0000 Subject: [PATCH 04/20] refactor: best component selection logic. Improve rank processing --- src/license.c | 10 +- src/match.c | 307 ++++++++++++++++++++++++++++++++--------------- src/match_list.c | 6 +- src/url.c | 4 +- src/util.c | 22 ++-- 5 files changed, 238 insertions(+), 111 deletions(-) diff --git a/src/license.c b/src/license.c index 13a8592..3c8a01f 100644 --- a/src/license.c +++ b/src/license.c @@ -68,7 +68,15 @@ bool license_add_to_list(struct license_list * ptr, char * license) ptr->licenses = realloc(ptr->licenses, sizeof(char *) * (ptr->count + 1)); if (!ptr->licenses) return false; - ptr->licenses[ptr->count] = strdup(license); + + /* Allocate with extra padding for CRC32C hardware reads (8-byte blocks) */ + size_t len = strlen(license); + size_t padded_len = ((len + 8) / 8) * 8; /* Round up to next 8-byte boundary */ + ptr->licenses[ptr->count] = calloc(1, padded_len); + if (!ptr->licenses[ptr->count]) + return false; + strcpy(ptr->licenses[ptr->count], license); + ptr->count++; return true; } diff --git a/src/match.c b/src/match.c index 9eb6b49..23d52ce 100644 --- a/src/match.c +++ b/src/match.c @@ -252,9 +252,75 @@ static void evaluate_path_rank(component_data_t *comp) } } +/** + * @brief Initialize component age by computing MD5 of purl and fetching age from database + * @param comp Component to initialize + */ +static inline void initialize_component_age(component_data_t *comp) +{ + if (!comp->purls_md5[0] && comp->purls[0]) + { + comp->purls_md5[0] = malloc(MD5_LEN); + MD5((uint8_t *)comp->purls[0], strlen(comp->purls[0]), comp->purls_md5[0]); + comp->age = get_component_age(comp->purls_md5[0]); + } +} + +/** + * @brief Compare two integer values and return comparison result + * @param val_a Value from component a + * @param val_b Value from component b + * @param prefer_higher If true, higher value wins; if false, lower value wins + * @return 1 if b wins, -1 if a wins, 0 if tie + */ +static inline int compare_int_values(int val_a, int val_b, bool prefer_higher) +{ + if (val_a == val_b) + return 0; + + if (prefer_higher) + return (val_b > val_a) ? 1 : -1; + else + return (val_b < val_a) ? 1 : -1; +} + +int compare_file_extension(component_data_t *a, component_data_t *b) +{ + if (!a->file_path_ref) + return 0; + + char *ext_file = extension(a->file_path_ref); + if (!ext_file) + return 0; + + char *ext_a = extension(a->file); + char *ext_b = extension(b->file); + + if (!ext_a && ext_b) + return 1; + + if (ext_a && !ext_b) + return -1; + + if (!ext_a && !ext_b) + return 0; + + int result_a = strcmp(ext_a, ext_file); + int result_b = strcmp(ext_b, ext_file); + + if (result_a == result_b) + return 0; + else if (!result_a) + return -1; + else if (!result_b) + return 1; + + return 0; +} + /** * @brief Funtion to be called as pointer when a new compoent has to be loaded in to the list - * + * * @param a existent component in the list * @param b new component to be added * @return true b has to be included in the list before "a" @@ -263,61 +329,61 @@ static void evaluate_path_rank(component_data_t *comp) static bool component_hint_date_comparation(component_data_t *a, component_data_t *b) { + // 1. Declared components (SBOM) evaluation if (declared_components) { scanlog("ASSETS eval- %d / %d\n", a->identified, b->identified); - if (a->identified > b->identified) - { - scanlog("Reject component %s@%s by SBOM\n", b->purls[0], b->version); - return false; - } - - if (b->identified > a->identified) + if (a->identified != b->identified) { + if (a->identified > b->identified) + { + scanlog("Reject component %s@%s by SBOM\n", b->purls[0], b->version); + return false; + } scanlog("Accept component %s@%s by SBOM\n", b->purls[0], b->version); return true; } } - + // 2. Component hint evaluation else if (component_hint) { scanlog("hint eval\n"); - int result = hint_eval(a,b); - if (result > 0) - return true; - if (result < 0) - return false; + int hint_result = hint_eval(a,b); + if (hint_result != 0) + return hint_result > 0; } + // 3. Path rank hint evaluation if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) { - //evalute path rank for component a evaluate_path_rank(a); - - //evalute path rank for component b evaluate_path_rank(b); - //The path_rank will be used as hint only when it has a reasonable value, in other cases the critea will be ignored. - if (b->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + const int rank_threshold = PATH_LEVEL_COMP_REF / 3 + 1; + + // Path rank is used as hint only when it has a reasonable value + if (b->path_rank < rank_threshold) { - if (b->path_rank - a->path_rank < 0) + int rank_diff = b->path_rank - a->path_rank; + if (rank_diff < 0) { scanlog("%s wins %s by path rank %d\n", b->purls[0], a->purls[0], b->path_rank); return true; } - if (b->path_rank - a->path_rank > 0) + if (rank_diff > 0) { scanlog("%s - %s loses %s by path rank %d/%d\n", b->purls[0],b->file, a->purls[0], b->path_rank, a->path_rank); return false; } } - else if (a->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + else if (a->path_rank < rank_threshold) { scanlog("%s rejected, %s wins by path rank %d\n", b->purls[0], a->purls[0], a->path_rank); return false; } } + // 4. Release date validation if (!*b->release_date) { scanlog("%s rejected due to empty release date\n", b->purls[0]); @@ -329,18 +395,31 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return true; } - // Third-party path evaluation + int file_extension_comp = compare_file_extension(a, b); + if (file_extension_comp < 0) + { + scanlog("%s rejected by file extension match\n", b->purls[0]); + return false; + } + else if (file_extension_comp > 0) + { + scanlog("%s accepted by file extension mismatch\n", b->purls[0]); + return true; + } + + // 5. Third-party path evaluation int tp_a = path_is_third_party(a); int tp_b = path_is_third_party(b); + int tp_diff = tp_a - tp_b; - if (tp_a - tp_b > 4) + if (tp_diff > 6) { scanlog("Component rejected by third party path filter (%s=%d=%s > %s=%d=%s)\n", a->purls[0], tp_a,a->file, b->purls[0], tp_b, b->file); return false; } - else if (tp_b - tp_a > 4) + if (tp_diff < - 6) { - scanlog("Component accepted by third party path filter (%s=%d < %s=%d)\n", a->purls[0], tp_a, b->purls[0], tp_b); + scanlog("Component accepted by third party path filter (%s=%d=%s < %s=%d=%s)\n", a->purls[0], tp_a, a->file, b->purls[0], tp_b, b->file); return true; } @@ -349,16 +428,34 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ { bool good_purl_a = binary_file_to_purl(a); bool good_purl_b = binary_file_to_purl(b); - if (good_purl_b && !good_purl_a) - { - scanlog("Component %s prefered over %s by binary purl match\n", b->purls[0], a->purls[0]); - return true; - } - else if (good_purl_a && !good_purl_b) + + if (good_purl_b != good_purl_a) { + if (good_purl_b) + { + scanlog("Component %s prefered over %s by binary purl match\n", b->purls[0], a->purls[0]); + return true; + } scanlog("Component %s rejected by binary purl match\n", b->purls[0]); return false; } + else if (good_purl_b && good_purl_a) + { + // 7.3. Vendor component check + bool vendor_check_a = purl_vendor_component_check(a); + bool vendor_check_b = purl_vendor_component_check(b); + + if (vendor_check_a != vendor_check_b) + { + if (vendor_check_b) + { + scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); + return true; + } + scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); + return false; + } + } if (b->rank >= COMPONENT_RANK_SELECTION_MAX && a->rank < COMPONENT_RANK_SELECTION_MAX) { @@ -366,126 +463,144 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } - //lower rank selection logic + // Lower rank selection logic if (b->rank <= COMPONENT_RANK_SELECTION_MAX) { - scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); - //shorter path lenght are prefered - if (b->path_depth < a->path_depth/2) - { - scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); - return true; - } - else if (a->path_depth < b->path_depth/2) - { - scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); - return false; - } + bool same_component = !strcmp(a->component, b->component); - if(b->path_depth > a->path_depth+1) + // If both components are the same, the best ranked purl must win + if (same_component && b->rank != a->rank) { - scanlog("%s rejected by deeper path in rank selection %d > %d\n", b->purls[0], b->path_depth, a->path_depth); + if (b->rank < a->rank) + { + scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); + return true; + } + scanlog("%s rejected by rank %d\n", b->purls[0], b->rank); return false; } - if (b->rank < a->rank) - { - scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); - return true; + // Shorter path lengths are preferred for rank difference not so big + if (abs(b->rank - a->rank) < 5) + { + scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); + if (b->path_depth + 2 < a->path_depth/2) + { + scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return true; + } + if (a->path_depth + 2 < b->path_depth/2) + { + scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return false; + } + + if(b->path_depth > a->path_depth+1) + { + scanlog("%s rejected by deeper path in rank selection %d > %d\n", b->purls[0], b->path_depth, a->path_depth); + return false; + } } - else if (b->rank > a->rank) + if (b->rank != a->rank) { + if (b->rank < a->rank) + { + scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); + return true; + } scanlog("%s rejected by rank %d\n", b->purls[0], b->rank); return false; } } } - /*if the relese date is the same untie with the component age (purl)*/ + // 7. If release dates are equal, use tiebreakers if (!strcmp(b->release_date, a->release_date)) - { - if (purl_source_check(a) > purl_source_check(b)) + { + // 7.1. Source check + int source_a = purl_source_check(a); + int source_b = purl_source_check(b); + int source_cmp = compare_int_values(source_a, source_b, false); + + if (source_cmp > 0) { scanlog("%s accepted over %s by source check\n", b->purls[0], a->purls[0]); return true; } - else if (purl_source_check(b) > purl_source_check(a)) + if (source_cmp < 0) { scanlog("%s rejected by source check\n", b->purls[0]); return false; } - //Look for available health information + // 7.2. Health information print_health(a); print_health(b); - int health_a = a->health_stats[0] + a->health_stats[2]; //add forks and watchers + int health_a = a->health_stats[0] + a->health_stats[2]; // forks + watchers int health_b = b->health_stats[0] + b->health_stats[2]; + int health_cmp = compare_int_values(health_a, health_b, true); - - if (health_b > health_a) + if (health_cmp > 0) { scanlog("Component prefered by health: %s = %d vs %s = %d\n", b->purls[0], health_b, a->purls[0], health_a); return true; } - else if (health_a > health_b) - { + if (health_cmp < 0) return false; - } - - if (!purl_vendor_component_check(a) && purl_vendor_component_check(b)) - { - scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); - return true; - } - else if (purl_vendor_component_check(a) && !purl_vendor_component_check(b)) + // 7.3. Vendor component check + bool vendor_check_a = purl_vendor_component_check(a); + bool vendor_check_b = purl_vendor_component_check(b); + + if (vendor_check_a != vendor_check_b) { + if (vendor_check_b) + { + scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); + return true; + } scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); return false; } - if (!a->purls_md5[0] && a->purls[0]) - { - a->purls_md5[0] = malloc(MD5_LEN); - MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); - a->age = get_component_age(a->purls_md5[0]); - } - - if (!b->purls_md5[0] && b->purls[0]) - { - b->purls_md5[0] = malloc(MD5_LEN); - MD5((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); - b->age = get_component_age(b->purls_md5[0]); - } - + // 7.4. Component age (lazy initialization) + initialize_component_age(a); + initialize_component_age(b); + if ((!a->age && b->age) || b->age > a->age) { scanlog("Component %s prefered over %s by purl date (age: %ld vs %ld)\n", b->purls[0], a->purls[0], b->age, a->age); return true; } - else if ((!b->age && a->age) || a->age > b->age) + if ((!b->age && a->age) || a->age > b->age) { scanlog("Component %s rejected by purl date (age: %ld vs %ld)\n", b->purls[0], b->age, a->age); return false; } - if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(a->version, b->version) > 0) - { - scanlog("Component %s prefered over %s by version\n", b->purls[0], a->purls[0]); - return true; - } - else if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(b->version, a->version) > 0) + // 7.5. Version comparison (only if same component and age) + if (b->age == a->age && !strcmp(a->component, b->component)) { - scanlog("Component %s rejected by version comparison\n", b->purls[0]); - return false; + int version_cmp = strcmp(a->version, b->version); + if (version_cmp > 0) + { + scanlog("Component %s prefered over %s by version\n", b->purls[0], a->purls[0]); + return true; + } + if (version_cmp < 0) + { + scanlog("Component %s rejected by version comparison\n", b->purls[0]); + return false; + } } } - /*select the oldest release date */ - if (strcmp(b->release_date, a->release_date) < 0) + // 8. Select the oldest release date + int date_cmp = strcmp(b->release_date, a->release_date); + if (date_cmp < 0) { scanlog("Component %s (rank %d) prefered over %s (rank %d) by release date\n", b->purls[0],b->rank, a->purls[0], a->rank); return true; } - else if (strcmp(b->release_date, a->release_date) > 0) + if (date_cmp > 0) { scanlog("Component %s (rank %d) rejected, %s (rank %d) wins by older release date\n", b->purls[0], b->rank, a->purls[0], a->rank); return false; diff --git a/src/match_list.c b/src/match_list.c index 9df35af..134e429 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -96,12 +96,12 @@ bool component_list_add(component_list_t *list, component_data_t *new_comp, bool { if (list->items >= list->max_items) return false; - + struct comp_entry *nn = calloc(1, sizeof(struct comp_entry)); /* Insert after. */ - nn->component = new_comp; + nn->component = new_comp; LIST_INSERT_AFTER(list->last_element, nn, entries); list->last_element_aux = list->last_element; - list->last_element = nn; + list->last_element = nn; list->items++; return true; } diff --git a/src/url.c b/src/url.c index 027cd88..be23c7a 100644 --- a/src/url.c +++ b/src/url.c @@ -77,7 +77,9 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra memcpy(new_comp->url_md5, key, LDB_KEY_LN); memcpy(new_comp->url_md5 + LDB_KEY_LN, subkey, subkey_ln); new_comp->url_match = true; - new_comp->file = strdup(new_comp->url); + char * file_name = strdup(new_comp->url); + new_comp->file = strdup(basename(file_name)); + free(file_name); new_comp->file_md5_ref = component_list->match_ref->file_md5; new_comp->identified = IDENTIFIED_NONE; asset_declared(new_comp); diff --git a/src/util.c b/src/util.c index 00b03e7..9967cb3 100644 --- a/src/util.c +++ b/src/util.c @@ -339,8 +339,8 @@ int path_is_third_party(component_data_t *comp) if (!comp->file) return 0; - - char * path = dirname(comp->file); + char * full_path = strdup(comp->file); + char * path = dirname(full_path); const char* patterns[] = { // Explicit third-party naming @@ -360,7 +360,6 @@ int path_is_third_party(component_data_t *comp) // Build/dependency management directories "external", // Maven, CMake external dependencies - "externals", // Alternative "dependencies", // Generic dependency directories "dep", // Short form "packages", // NuGet, Generic (covers packages.lock) @@ -388,26 +387,29 @@ int path_is_third_party(component_data_t *comp) "contrib", // Contributed/third-party code "plugin", // Plugins (often third-party) - "utils","lib", "components", "modules", "ext", - "fixtures", "examples", - "files", "assets", "runtime", + "utils", "common", "components", "modules", "ext", + "fixtures", "examples","assets", "runtime", "subprojects", "managed", "local_packages", "published", - "driver", "libresources", "offloading","documentation", "test" + "libresources", "offloading", "media","lib", "documentation", "test", "service","driver", "files" }; const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); - if (!strcmp(path, comp->file)) + if (!strcmp(path, ".")) + { + free(full_path); return numPatterns; + } for (int i = 0; i < numPatterns; i++) { if (strcasestr(path, patterns[i]) != NULL) { - return i; + free(full_path); + return i; } } - + free(full_path); return numPatterns + 1; } From c7c6698fc7888ac9a5f03291656501d6eaf8be49 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Tue, 9 Dec 2025 12:13:14 +0000 Subject: [PATCH 05/20] third party filter tune up --- src/match.c | 10 ++-------- src/util.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/match.c b/src/match.c index 23d52ce..821f9b3 100644 --- a/src/match.c +++ b/src/match.c @@ -412,12 +412,12 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ int tp_b = path_is_third_party(b); int tp_diff = tp_a - tp_b; - if (tp_diff > 6) + if (tp_diff > 7) { scanlog("Component rejected by third party path filter (%s=%d=%s > %s=%d=%s)\n", a->purls[0], tp_a,a->file, b->purls[0], tp_b, b->file); return false; } - if (tp_diff < - 6) + if (tp_diff < -7) { scanlog("Component accepted by third party path filter (%s=%d=%s < %s=%d=%s)\n", a->purls[0], tp_a, a->file, b->purls[0], tp_b, b->file); return true; @@ -494,12 +494,6 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); return false; } - - if(b->path_depth > a->path_depth+1) - { - scanlog("%s rejected by deeper path in rank selection %d > %d\n", b->purls[0], b->path_depth, a->path_depth); - return false; - } } if (b->rank != a->rank) { diff --git a/src/util.c b/src/util.c index 9967cb3..93cdc21 100644 --- a/src/util.c +++ b/src/util.c @@ -361,7 +361,7 @@ int path_is_third_party(component_data_t *comp) // Build/dependency management directories "external", // Maven, CMake external dependencies "dependencies", // Generic dependency directories - "dep", // Short form + "deps", // Short form "packages", // NuGet, Generic (covers packages.lock) // Language-specific package directories @@ -378,19 +378,17 @@ int path_is_third_party(component_data_t *comp) "imported", // Imported code "foreign", // Foreign code - // Build output that may contain third-party - "dist", // Distribution builds - "release", // Release builds - "bundle", // Bundled dependencies - // Contribution/extension directories "contrib", // Contributed/third-party code "plugin", // Plugins (often third-party) - "utils", "common", "components", "modules", "ext", + "utils", "components", "modules", "ext", "fixtures", "examples","assets", "runtime", "subprojects", "managed", "local_packages", "published", - "libresources", "offloading", "media","lib", "documentation", "test", "service","driver", "files" + "libresources", "offloading", "compile", "release", "bundle", + "media", "documentation", "test", + "service","lib","dist", + "driver", "common","files" }; const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); @@ -410,7 +408,7 @@ int path_is_third_party(component_data_t *comp) } } free(full_path); - return numPatterns + 1; + return numPatterns; } /** From 71d354a94c21288d9e1f932ba0eae95c10afc45a Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Tue, 9 Dec 2025 12:28:45 +0000 Subject: [PATCH 06/20] improve component_hint_date_comparation comments --- src/match.c | 112 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/src/match.c b/src/match.c index 821f9b3..5b5c877 100644 --- a/src/match.c +++ b/src/match.c @@ -319,32 +319,49 @@ int compare_file_extension(component_data_t *a, component_data_t *b) } /** - * @brief Funtion to be called as pointer when a new compoent has to be loaded in to the list + * @brief Component comparison function for determining insertion order in the component list * - * @param a existent component in the list - * @param b new component to be added - * @return true b has to be included in the list before "a" - * @return false "a" wins, compare with the next component. + * This function implements the component selection logic using multiple hierarchical criteria: + * 1. Declared components (SBOM) evaluation + * 2. Component hints (purl and component name matching) + * 3. Path rank hint evaluation (file path similarity) + * 4. Release date validation + * 5. File extension matching + * 6. Third-party path evaluation + * 7. URL ranking and binary purl matching + * 8. Tiebreakers for equal release dates (source check, health metrics, vendor check, component age, version comparison) + * 9. Final selection based on oldest release date + * + * @param a Existing component in the list to compare against + * @param b New candidate component to be added + * @return true If component 'b' should be inserted before 'a' (b wins) + * @return false If component 'a' wins, continue comparing with the next component */ static bool component_hint_date_comparation(component_data_t *a, component_data_t *b) { // 1. Declared components (SBOM) evaluation + // Prioritize components that are declared in the SBOM (Software Bill of Materials) + // identified > 0 means the component was declared/identified in the SBOM if (declared_components) { scanlog("ASSETS eval- %d / %d\n", a->identified, b->identified); if (a->identified != b->identified) { + // Keep component 'a' if it's identified and 'b' is not if (a->identified > b->identified) { scanlog("Reject component %s@%s by SBOM\n", b->purls[0], b->version); return false; } + // Accept component 'b' if it's identified and 'a' is not scanlog("Accept component %s@%s by SBOM\n", b->purls[0], b->version); return true; } } // 2. Component hint evaluation + // Apply user-provided component hints to influence selection + // Hints can match against purl or component names else if (component_hint) { scanlog("hint eval\n"); @@ -352,8 +369,10 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (hint_result != 0) return hint_result > 0; } - + // 3. Path rank hint evaluation + // Compare file path similarity between scanned file and component file paths + // Lower rank means better similarity (more matching path components) if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) { evaluate_path_rank(a); @@ -361,21 +380,25 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ const int rank_threshold = PATH_LEVEL_COMP_REF / 3 + 1; - // Path rank is used as hint only when it has a reasonable value + // Path rank is used as hint only when it has a reasonable value (below threshold) + // This prevents poor matches from being selected based on path alone if (b->path_rank < rank_threshold) { int rank_diff = b->path_rank - a->path_rank; + // Component 'b' has better path similarity than 'a' if (rank_diff < 0) { scanlog("%s wins %s by path rank %d\n", b->purls[0], a->purls[0], b->path_rank); return true; } + // Component 'a' has better path similarity than 'b' if (rank_diff > 0) { scanlog("%s - %s loses %s by path rank %d/%d\n", b->purls[0],b->file, a->purls[0], b->path_rank, a->path_rank); return false; } } + // If only 'a' has a good path rank, keep it else if (a->path_rank < rank_threshold) { scanlog("%s rejected, %s wins by path rank %d\n", b->purls[0], a->purls[0], a->path_rank); @@ -384,6 +407,8 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ } // 4. Release date validation + // Reject components without valid release dates + // Components must have release date information to be considered if (!*b->release_date) { scanlog("%s rejected due to empty release date\n", b->purls[0]); @@ -395,42 +420,55 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return true; } + // 5. File extension matching + // Prefer components where the file extension matches the scanned file int file_extension_comp = compare_file_extension(a, b); if (file_extension_comp < 0) { + // Component 'a' has matching extension, 'b' does not scanlog("%s rejected by file extension match\n", b->purls[0]); return false; } else if (file_extension_comp > 0) { + // Component 'b' has matching extension, 'a' does not scanlog("%s accepted by file extension mismatch\n", b->purls[0]); return true; } - // 5. Third-party path evaluation + // 6. Third-party path evaluation + // Prefer components from third-party directories (vendor, external, 3rdparty, etc.) + // Higher score means more likely to be a third-party component int tp_a = path_is_third_party(a); int tp_b = path_is_third_party(b); int tp_diff = tp_a - tp_b; + // If 'a' is significantly more third-party than 'b' (difference > 7), reject 'b' if (tp_diff > 7) { scanlog("Component rejected by third party path filter (%s=%d=%s > %s=%d=%s)\n", a->purls[0], tp_a,a->file, b->purls[0], tp_b, b->file); return false; } + // If 'b' is significantly more third-party than 'a' (difference < -7), accept 'b' if (tp_diff < -7) { scanlog("Component accepted by third party path filter (%s=%d=%s < %s=%d=%s)\n", a->purls[0], tp_a, a->file, b->purls[0], tp_b, b->file); return true; } - //when the url ranking is enabled + // 7. URL ranking and binary purl matching + // When URL ranking is enabled (rank < COMPONENT_DEFAULT_RANK), use ranking metrics + // Lower rank values indicate higher quality/more authoritative sources if (b->rank < COMPONENT_DEFAULT_RANK || a->rank < COMPONENT_DEFAULT_RANK) - { + { + // 7.1. Binary file to purl matching + // Check if the component's purl matches what would be expected for a binary file bool good_purl_a = binary_file_to_purl(a); bool good_purl_b = binary_file_to_purl(b); if (good_purl_b != good_purl_a) { + // Prefer component with matching binary purl if (good_purl_b) { scanlog("Component %s prefered over %s by binary purl match\n", b->purls[0], a->purls[0]); @@ -439,9 +477,11 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ scanlog("Component %s rejected by binary purl match\n", b->purls[0]); return false; } + // If both have good binary purls, check vendor+component match else if (good_purl_b && good_purl_a) { - // 7.3. Vendor component check + // 7.2. Vendor component check for binary purls + // Verify if vendor and component names align with the purl bool vendor_check_a = purl_vendor_component_check(a); bool vendor_check_b = purl_vendor_component_check(b); @@ -457,18 +497,22 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ } } + // 7.3. Rank threshold check + // Reject components with rank above the maximum selection threshold if (b->rank >= COMPONENT_RANK_SELECTION_MAX && a->rank < COMPONENT_RANK_SELECTION_MAX) { scanlog("%s rejected by rank threshold %d >= %d\n", b->purls[0], b->rank, COMPONENT_RANK_SELECTION_MAX); return false; } - - // Lower rank selection logic + + // 7.4. Lower rank selection logic + // For components with acceptable ranks (below max threshold), apply additional criteria if (b->rank <= COMPONENT_RANK_SELECTION_MAX) { bool same_component = !strcmp(a->component, b->component); - // If both components are the same, the best ranked purl must win + // 7.4.1. Same component comparison - prefer better ranked purl + // When comparing different sources of the same component, rank is decisive if (same_component && b->rank != a->rank) { if (b->rank < a->rank) @@ -480,21 +524,26 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } - // Shorter path lengths are preferred for rank difference not so big + // 7.4.2. Path depth comparison for similar ranks + // When ranks are close (difference < 5), prefer shorter file paths + // Shorter paths often indicate more direct/canonical locations if (abs(b->rank - a->rank) < 5) - { + { scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); + // Component 'b' has significantly shorter path (less than half of 'a') if (b->path_depth + 2 < a->path_depth/2) { scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); return true; } + // Component 'a' has significantly shorter path (less than half of 'b') if (a->path_depth + 2 < b->path_depth/2) { scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); return false; } } + // 7.4.3. Final rank comparison if no other criteria applied if (b->rank != a->rank) { if (b->rank < a->rank) @@ -507,10 +556,13 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ } } } - // 7. If release dates are equal, use tiebreakers + // 8. Tiebreakers for equal release dates + // When release dates are identical, use additional criteria to select the best component if (!strcmp(b->release_date, a->release_date)) { - // 7.1. Source check + // 8.1. Source check + // Prefer components from more authoritative sources (official repos, etc.) + // Lower source value is better (prefer_higher = false) int source_a = purl_source_check(a); int source_b = purl_source_check(b); int source_cmp = compare_int_values(source_a, source_b, false); @@ -526,7 +578,9 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } - // 7.2. Health information + // 8.2. Health information + // Prefer components from healthier projects (more forks + watchers) + // Higher health value is better (prefer_higher = true) print_health(a); print_health(b); int health_a = a->health_stats[0] + a->health_stats[2]; // forks + watchers @@ -541,7 +595,8 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (health_cmp < 0) return false; - // 7.3. Vendor component check + // 8.3. Vendor component check + // Verify if vendor and component names align with the purl bool vendor_check_a = purl_vendor_component_check(a); bool vendor_check_b = purl_vendor_component_check(b); @@ -555,8 +610,10 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); return false; } - - // 7.4. Component age (lazy initialization) + + // 8.4. Component age (lazy initialization) + // Prefer older components (first appearance in package repositories) + // Higher age value means the component was published earlier initialize_component_age(a); initialize_component_age(b); @@ -571,7 +628,9 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } - // 7.5. Version comparison (only if same component and age) + // 8.5. Version comparison (only if same component and age) + // For the same component with same age, use lexicographic version comparison + // Lower version string is preferred (usually represents older/more stable versions) if (b->age == a->age && !strcmp(a->component, b->component)) { int version_cmp = strcmp(a->version, b->version); @@ -587,19 +646,24 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ } } } - // 8. Select the oldest release date + // 9. Final decision: Select the oldest release date + // When no other criteria has decided, prefer the component with the earlier release date + // This implements the fundamental principle of preferring older, more established versions int date_cmp = strcmp(b->release_date, a->release_date); if (date_cmp < 0) { + // Component 'b' has an earlier release date (date_cmp < 0 means b->release_date < a->release_date) scanlog("Component %s (rank %d) prefered over %s (rank %d) by release date\n", b->purls[0],b->rank, a->purls[0], a->rank); return true; } if (date_cmp > 0) { + // Component 'a' has an earlier release date scanlog("Component %s (rank %d) rejected, %s (rank %d) wins by older release date\n", b->purls[0], b->rank, a->purls[0], a->rank); return false; } + // No criteria matched or all criteria were equal - reject component 'b' scanlog("Component %s rejected, no criteria matched\n", b->purls[0]); return false; } From 8aab5f656f99101ea446c93fdabad261d7e4684a Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Wed, 10 Dec 2025 10:57:52 +0000 Subject: [PATCH 07/20] refactor: update license report by SP-3766 --- inc/limits.h | 7 ---- src/license.c | 91 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/inc/limits.h b/inc/limits.h index ddb8c7c..35a24f5 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -49,13 +49,6 @@ /* Variables */ -/* During snippet scanning, when a wfp (with more than consecutive_threshold wfps) produces a score higher - than consecutive_score by consecutive_hits in a row, the scan will skip consecutive_jump snippets */ -extern int consecutive_score; -extern int consecutive_hits; -extern int consecutive_jump; -extern int consecutive_threshold; - extern int range_tolerance; // A maximum number of non-matched lines tolerated inside a matching range extern int min_match_lines; // Minimum number of lines matched for a match range to be acepted extern int min_match_hits; // Minimum number of snippet ID hits to produce a snippet match diff --git a/src/license.c b/src/license.c index 3c8a01f..74e6621 100644 --- a/src/license.c +++ b/src/license.c @@ -43,17 +43,8 @@ #include "file.h" #include "query.h" -/** @brief License sources - 0 = Declared in component - 1 = Declared in file with SPDX-License-Identifier - 2 = Detected in header - 3 = Declared in LICENSE file - 4 = Scancode detection - 5 = Scancode detection at mining time - 6 = osslot */ -const char *license_sources[] = {"component_declared", "file_spdx_tag", "file_header", "license_file", "scancode-file", "scancode", "osselot"}; -bool full_license_report = false; +bool full_license_report = false; struct license_list { @@ -61,6 +52,45 @@ struct license_list int count; }; +//convert license id to license report name +static char * license_id_to_source_name(int id) +{ + switch (id) + { + case 0: + case 35: + return "component_declared"; + case 1: + return "file_spdx_tag"; + case 2: + return "file_header"; + case 3: + case 31: + return "license_file"; + case 4: + return "scancode_file"; + case 5: + return "scancode"; + case 6: + return "component_declared"; + case 7: + case 9: + return "underlying_component"; + case 71: + case 72: + case 73: + case 74: + return "underlying_license_file"; + case 8: + return "scancode"; + + case 10: + return "osselot"; + default: + return NULL; + } +} + bool license_add_to_list(struct license_list * ptr, char * license) { if (!ptr || !license || strlen(license) < 2) @@ -230,6 +260,13 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i if (!*license || strlen(license) < 2) return buffer; + + char * license_source_id = license_id_to_source_name(src); + if (!license_source_id) + return buffer; + //skip scancode licenses starting with "license-ref" + if (!strncmp(license_source_id, "scancode", 8) && !strncmp(license, "license-ref", 11)) + return buffer; /* Calculate CRC to avoid duplicates */ uint32_t CRC = string_crc32c(license); @@ -249,7 +286,7 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i len += sprintf(buffer + len, "{"); len += sprintf(buffer + len, "\"name\": \"%s\",", license); len += osadl_print_license(buffer + len, license, true); - len += sprintf(buffer + len, "\"source\": \"%s\"", license_sources[src]); + len += sprintf(buffer + len, "\"source\": \"%s\"", license_source_id); if (!strstr(license, "LicenseRef")) len += sprintf(buffer + len, ",\"url\": \"https://spdx.org/licenses/%s.html\"", license); len += sprintf(buffer + len, "}"); @@ -283,7 +320,7 @@ static char *split_in_json_array(uint32_t *crclist, char *buffer, char *license, return r; // Return the updated buffer pointer, not the original } -char * license_to_json(uint32_t *crclist, char *buffer, char *license, int src, bool *first_record) +char * license_to_json(uint32_t *crclist, char *buffer, char *license, int src, bool *first_record) { if (!strchr(license, '/')) return json_from_license(crclist, buffer, license, src, first_record); @@ -346,8 +383,7 @@ bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * int src = atoi(source); scanlog("Fetched License %s - source ID %d\n", license, src); - if (src < (sizeof(license_sources) / sizeof(license_sources[0]))) - license_add_to_list(&licenses[src], license); + license_add_to_list(licenses, license); free(source); free(license); @@ -377,14 +413,12 @@ void print_licenses(component_data_t *comp) uint32_t records = 0; comp->license_text = NULL; - int license_types = sizeof(license_sources) / sizeof(license_sources[0]); - struct license_list licenses_by_type[license_types]; - memset(licenses_by_type, 0, sizeof(licenses_by_type)); + struct license_list licenses_by_type = {.count = 0, .licenses = NULL}; /* Print URL license */ if (comp->license && strlen(comp->license) > 2) { - license_add_to_list(&licenses_by_type[0], comp->license); + license_add_to_list(&licenses_by_type, comp->license); scanlog("License present in URL table"); } else @@ -412,7 +446,7 @@ void print_licenses(component_data_t *comp) //Look if someone of the prefered liceses ids already has a match for (int i = 0; i < 4; i++) { - if (licenses_by_type[i].count > 0) + if (licenses_by_type.count > 0) { scanlog("Stop searching for licenses\n"); break; @@ -448,26 +482,15 @@ void print_licenses(component_data_t *comp) buffer = result + len; bool first = true; - for (int i = 0; i < license_types; i++) + for (int i = 0; i < licenses_by_type.count; i++) { - if (licenses_by_type[i].count > 0) - { - if (i > 3 && !first && !full_license_report) - break; - for (int j = 0; j < licenses_by_type[i].count; j++) - { - buffer = license_to_json(crclist, buffer, licenses_by_type[i].licenses[j], i, &first); - } - } + buffer = license_to_json(crclist, buffer, licenses_by_type.licenses[i], i, &first); } len = buffer - result; len += sprintf(result + len, "]"); comp->license_text = result; - /* Free all license lists */ - for (int i = 0; i < license_types; i++) - { - license_free_list(&licenses_by_type[i]); - } + license_free_list(&licenses_by_type); + } From ac624a53e0659258e66ebd94ef70e832fe327316 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Sat, 27 Dec 2025 21:50:52 +0000 Subject: [PATCH 08/20] add logs, fix url rank bug --- src/component.c | 6 +++--- src/license.c | 2 +- src/main.c | 4 +++- src/scan.c | 4 ---- src/url.c | 8 ++++++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/component.c b/src/component.c index 8906313..8b4bc02 100644 --- a/src/component.c +++ b/src/component.c @@ -253,7 +253,7 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa extract_csv(license, (char *)url_record, 5, sizeof(license)); extract_csv(purl, (char *)url_record, 6, sizeof(purl)); extract_csv(url, (char *)url_record, 7, sizeof(url)); - extract_csv(rank, (char *)url_record, 13, sizeof(rank)); //extracts the rank field if available + extract_csv(rank, (char *)url_record, 14, sizeof(rank)); //extracts the rank field if available /* Fill url stats if these are available*/ for (int i = 0; i < 5; i++) { char stat[16] = "\0"; @@ -292,10 +292,10 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa MD5((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); } component->age = -1; - if (*rank && strlen(rank) < 3) + if (*rank) { component->rank = atoi(rank); - //scanlog("Component rank from DB: %d\n", component->rank); + //scanlog("Component rank from DB: %s- %d\n", rank, component->rank); } else component->rank = COMPONENT_DEFAULT_RANK; diff --git a/src/license.c b/src/license.c index 74e6621..a2dc600 100644 --- a/src/license.c +++ b/src/license.c @@ -265,7 +265,7 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i if (!license_source_id) return buffer; //skip scancode licenses starting with "license-ref" - if (!strncmp(license_source_id, "scancode", 8) && !strncmp(license, "license-ref", 11)) + if (!strncmp(license_source_id, "scancode", 8) && !strncmp(license, "LicenseRef", 10)) return buffer; /* Calculate CRC to avoid duplicates */ uint32_t CRC = string_crc32c(license); diff --git a/src/main.c b/src/main.c index 32ccbb8..7ab8253 100644 --- a/src/main.c +++ b/src/main.c @@ -326,6 +326,7 @@ int main(int argc, char **argv) int option_index = 0; bool invalid_argument = false; char * ldb_db_name = NULL; + while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhedqH", long_options, &option_index)) != -1) { /* Check valid alpha is entered */ @@ -355,6 +356,7 @@ int main(int argc, char **argv) break; case 'r': component_rank_max = atoi(optarg); + scanlog("Max component rank set to %d\n", component_rank_max); break; case 'k': @@ -434,7 +436,7 @@ int main(int argc, char **argv) case 'd': engine_flags = engine_flags_cmd_line; debug_on = true; - scanlog(""); // Log time stamp + scanlog_init(); break; case ':': diff --git a/src/scan.c b/src/scan.c index 84401fa..0123ab0 100644 --- a/src/scan.c +++ b/src/scan.c @@ -449,10 +449,6 @@ void ldb_scan(scan_data_t *scan) exit(EXIT_FAILURE); } - // Clean up the log file - if (debug_on) - scanlog_init(); - scan->matchmap_size = 0; scan->match_type = MATCH_NONE; scan->timer = microseconds_now(); diff --git a/src/url.c b/src/url.c index be23c7a..19fa9e4 100644 --- a/src/url.c +++ b/src/url.c @@ -86,10 +86,11 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra component_list_add(component_list, new_comp, component_date_comparation, true); } else + { + scanlog("ignoring component with rank %d\n", new_comp->rank); component_data_free(new_comp); - + } free(data); - return false; } @@ -330,6 +331,8 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, if (!url) return false; + //scanlog("url: %s\n", url); + /* Get oldest */ component_data_t **comp_address = ptr; component_data_t * comp_oldest = *comp_address; @@ -340,6 +343,7 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, bool result = fill_component(comp, key, NULL, (uint8_t *)url); if (!result || comp->rank > component_rank_max) { + scanlog("ignoring component with rank %d\n", comp->rank); free(url); component_data_free(comp); return false; From 124bc491204abb522fce0a5f7f59fe84f58eb89c Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Tue, 30 Dec 2025 10:46:15 +0000 Subject: [PATCH 09/20] update scan tunning parameters to meet the latest api definition --- README.md | 87 ++++++++++++++++++++++++++++++++++++----- inc/limits.h | 12 +++--- inc/scan.h | 12 ++++-- inc/scanoss.h | 2 - src/binary_scan.c | 2 +- src/debug.c | 2 +- src/help.c | 6 +-- src/limits.c | 5 --- src/main.c | 50 ++++++++++++++--------- src/report.c | 9 +++++ src/scan.c | 24 +++++++++--- src/snippet_selection.c | 22 +++++------ src/snippets.c | 37 +++++++----------- src/url.c | 16 ++++---- src/vulnerability.c | 1 + 15 files changed, 191 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index 9868666..f8ff79d 100644 --- a/README.md +++ b/README.md @@ -43,16 +43,83 @@ You can create your own knowledgebase with the minr command, available at https: Syntax: scanoss [parameters] [TARGET] -Configuration: -* -w Treats TARGET as a .wfp file regardless of the actual file extension -* -s FILE Use assets specified in the provided JSON SBOM (CycloneDX/SPDX2.2 JSON format) as input to identification -* -b FILE Ignore matches to assets specified in the provided JSON SBOM (CycloneDX/SPDX2.2 JSON format) - -Options: -* -t Tests engine performance -* -v Display version and exit -* -h Display this help and exit -* -d Enable debugging information +## Configuration Options + +### Basic Configuration +* `-w, --wfp` - Process TARGET as a .wfp file, regardless of its actual extension +* `-H, --hpsm` - Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system) +* `-M, --max-snippets NUM` - Search for up to NUM different components in each file (maximum: 9) +* `-N, --max-components NUM` - Set maximum number of components (default: 5) +* `-T, --tolerance NUM` - Set snippet scanning tolerance percentage (default: 0.1) +* `-r, --rank NUM` - Set maximum component rank accepted (default: 11) +* `--max-files NUM` - Set maximum number of files to fetch during matching (default: 12000) +* `--min-match-hits NUM` - Set minimum snippet ID hits for a match (default: 3, disables auto-adjust) +* `--min-match-lines NUM` - Set minimum matched lines for a range (default: 10, disables auto-adjust) +* `--ignore-file-ext` - Ignore file extension during snippet matching (default: honor extension) + +### SBOM and Filtering +* `-s, --sbom FILE` - Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification +* `-b, --blacklist FILE` - Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format) +* `--force-snippet` - Same as "-b" but with forced snippet scanning +* `-c, --component HINT` - Add a component HINT to guide scan results + +### Attribution and Licenses +* `-a, --attribution FILE` - Show attribution notices for the provided SBOM.json file +* `-k, --key KEY` - Show contents of the specified KEY file from MZ sources archive +* `-l, --license LICENSE` - Display OSADL metadata for the given SPDX license ID +* `-L, --full-license` - Enable full license report +* `-F, --flags FLAGS` - Set engine scanning flags (see Engine Flags section below) + +### General Options +* `-t, --test` - Run engine performance tests +* `-v, --version` - Show version information and exit +* `-n, --name NAME` - Set database name (default: oss) +* `-h, --help` - Display help information and exit +* `-d, --debug` - Store debugging information to disk (/tmp) +* `-q, --quiet` - Suppress JSON output (show only debugging info via STDERR) + +## Environment Variables + +* `SCANOSS_MATCHMAP_MAX` - Set the snippet scanning match map size (default: 10000) +* `SCANOSS_FILE_CONTENTS_URL` - Define the API URL endpoint for sources. Source URL won't be reported if not defined + +## Engine Scanning Flags + +Configure the scanning engine using flags with the `-F/--flags` parameter. These settings can also be specified in `/etc/scanoss_flags.cfg` + +| Flag | Setting | +|-------|-------------------------------------------------------| +| 1 | Disable snippet matching (default: enabled) | +| 2 | Enable snippet_ids (default: disabled) | +| 4 | Disable dependencies (default: enabled) | +| 8 | Disable licenses (default: enabled) | +| 16 | Disable copyrights (default: enabled) | +| 32 | Disable vulnerabilities (default: enabled) | +| 64 | Disable quality (default: enabled) | +| 128 | Disable cryptography (default: enabled) | +| 256 | Disable best match only (default: enabled) | +| 512 | Hide identified files (default: disabled) | +| 1024 | Enable download_url (default: disabled) | +| 2048 | Enable "use path hint" logic (default: disabled) | +| 4096 | Disable extended server stats (default: enabled) | +| 8192 | Disable health layer (default: enabled) | +| 16384 | Enable high accuracy, slower scan (default: disabled) | + +### Examples: +```bash +# Scan DIRECTORY without license and dependency data +scanoss -F 12 DIRECTORY +scanoss --flags 12 DIRECTORY + +# Scan TARGET including SBOM assets +scanoss --sbom my_sbom.json TARGET + +# Scan with custom snippet matching parameters +scanoss --min-match-hits 5 --min-match-lines 15 TARGET + +# Ignore file extensions during matching +scanoss --ignore-file-ext TARGET +``` # File matching logic diff --git a/inc/limits.h b/inc/limits.h index 35a24f5..c4f781d 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -38,7 +38,6 @@ #define FETCH_MAX_FILES_DEFAULT 12000 #define MIN_FILE_SIZE 256 // files below this size will be ignored #define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates -#define SNIPPET_LINE_TOLERANCE 10 /* Snippets */ #define DEFAULT_MATCHMAP_FILES 10000 // Default number of files evaluated in snippet matching @@ -46,12 +45,13 @@ #define MIN_LINES_COVERAGE 0.8 #define SKIP_SNIPPETS_IF_FILE_BIGGER (1024 * 1024 * 4) #define MAX_SNIPPETS_SCANNED 2500 - +#define SNIPPETS_DEFAULT_RANGE_TOLERANCE 5 /** A maximum number of non-matched lines tolerated inside a matching range */ +#define SNIPPETS_DEFAULT_MIN_MATCH_LINES 5 /** Minimum number of lines matched for a match range to be acepted */ +#define SNIPPETS_DEFAULT_MIN_MATCH_HITS 2 /** Minimum number of snippet ID hits to produce a snippet match*/ +#define SNIPPETS_DEFAULT_ADJUST_TOLERANCE true /** Adjust tolerance based on file size */ +#define SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION true /** Honor file extension during snippet matching */ +#define DEFAULT_FETCH_MAX_FILES 12000 /** Maximum number of files to fetch during component matching */ /* Variables */ - -extern int range_tolerance; // A maximum number of non-matched lines tolerated inside a matching range -extern int min_match_lines; // Minimum number of lines matched for a match range to be acepted -extern int min_match_hits; // Minimum number of snippet ID hits to produce a snippet match extern int fetch_max_files; // Maximum number of files to fetch during component matching #endif diff --git a/inc/scan.h b/inc/scan.h index 4407910..b75a564 100644 --- a/inc/scan.h +++ b/inc/scan.h @@ -66,16 +66,22 @@ typedef struct scan_data_t int max_matchmap_size; bool printed_succed; bool windows_line_endings; + bool snippet_adjust_tolerance; // Enable adjust snippet tolerance based on file size + int component_ranking_threshold; //-1 = disable ranking. 0 = all accepted + int snippet_min_hits; + int snippet_min_lines; + int snippet_range_tolerance; + int snippet_honor_file_extension; } scan_data_t; extern bool force_snippet_scan; -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components); +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); void scan_data_free (scan_data_t * scan); void ldb_scan(scan_data_t * scan); match_t ldb_scan_snippets(scan_data_t *scan_ptr); -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components); -int hash_scan(char *path, int scan_max_snippets, int scan_max_components); +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); #endif diff --git a/inc/scanoss.h b/inc/scanoss.h index 16de422..73a6295 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -80,7 +80,6 @@ typedef enum {MATCH_NONE, MATCH_FILE, MATCH_SNIPPET, MATCH_BINARY} match_t; extern long microseconds_start; extern int map_rec_len; -extern bool match_extensions; /*component hint hold the last component matched/guessed */ extern char * component_hint; @@ -104,7 +103,6 @@ extern struct ldb_table oss_notices; extern bool first_file; -extern int max_vulnerabilities; extern char *ignored_assets; diff --git a/src/binary_scan.c b/src/binary_scan.c index a7f7a88..4646a36 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -270,7 +270,7 @@ int binary_scan(char * input) char * file_name = field_n(3,input); int target_len = strchr(file_name,',') - file_name; char * target = strndup(file_name, target_len); - scan_data_t * scan = scan_data_init(target, 1, 1); + scan_data_t * scan = scan_data_init(target, 1, 1, true, false, 3, 5, false); free(target); memcpy(scan->md5, bin_md5, MD5_LEN); scan->match_type = MATCH_FILE; diff --git a/src/debug.c b/src/debug.c index 270c35b..8825ebc 100644 --- a/src/debug.c +++ b/src/debug.c @@ -138,7 +138,7 @@ void scan_benchmark() for (int f = 0; f < total_files ; f++) { - scan_data_t * scan = scan_data_init("pseudo_file", 0, 0); + scan_data_t * scan = scan_data_init("pseudo_file", 0, 0, true, false, 3, 5, false); scan->preload = true; memcpy(scan->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", MD5_LEN); strcpy(scan->file_size, "1024"); diff --git a/src/help.c b/src/help.c index baa9331..96d37bf 100644 --- a/src/help.c +++ b/src/help.c @@ -50,14 +50,14 @@ Syntax: scanoss [parameters] [TARGET]\n\ Configuration:\n\ -w, --wfp Process TARGET as a .wfp file, regardless of its actual extension.\n\ -H, --hpsm Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ --e, --extension Match only files with identical extensions as the scanned file (default: off).\n\ -M, --max-snippets NUM Search for up to NUM different components in each file (maximum: 9).\n\ -N, --max-components NUM Set maximum number of components (default: %d).\n\ -T, --tolerance NUM Set snippet scanning tolerance percentage (default: 0.1).\n\ -r, --rank NUM Set maximum component rank accepted (default: %d).\n\ --max-files NUM Set maximum number of files to fetch during matching (default: 12000).\n\ - --min-match-hits NUM Set minimum snippet ID hits for a match (default: 4).\n\ - --min-match-lines NUM Set minimum matched lines for a range (default: 10).\n\ + --min-match-hits NUM Set minimum snippet ID hits for a match (default: 3, disables auto-adjust).\n\ + --min-match-lines NUM Set minimum matched lines for a range (default: 10, disables auto-adjust).\n\ + --ignore-file-ext Ignore file extension during snippet matching (default: honor extension).\n\ -s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ -b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ --force-snippet FILE Same as \"-b\" but with forced snippet scanning.\n\ diff --git a/src/limits.c b/src/limits.c index 4f9772b..67b6cfd 100644 --- a/src/limits.c +++ b/src/limits.c @@ -9,9 +9,4 @@ * @see https://github.com/scanoss/engine/blob/master/src/limits.c */ -int range_tolerance = 5; /** A maximum number of non-matched lines tolerated inside a matching range */ -int min_match_lines = 10; /** Minimum number of lines matched for a match range to be acepted */ -int min_match_hits = 4; /** Minimum number of snippet ID hits to produce a snippet match*/ int fetch_max_files = 12000; /** Maximum number of files to fetch during component matching */ - -const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ \ No newline at end of file diff --git a/src/main.c b/src/main.c index 7ab8253..9d100e3 100644 --- a/src/main.c +++ b/src/main.c @@ -64,6 +64,12 @@ struct ldb_table oss_notices; component_item *ignore_components; component_item *declared_components; +int scan_min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; // Minimum number of lines matched for a match range to be acepted +int scan_min_match_hits = SNIPPETS_DEFAULT_MIN_MATCH_HITS; // Minimum number of snippet ID hits to produce a snippet match +bool scan_adjust_tolerance = SNIPPETS_DEFAULT_ADJUST_TOLERANCE; /** Adjust tolerance based on file size */ +int scan_ranking_threshold = 0; //enabled, all accepted by default +bool scan_honor_file_extension = SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION; + bool lib_encoder_present = false; #define LDB_VER_MIN "4.1.0" @@ -208,10 +214,12 @@ void recurse_directory(char *name) if (extension(path)) if (!strcmp(extension(path), "wfp")) wfp = true; if (wfp) - wfp_scan(path, scan_max_snippets, scan_max_components); + wfp_scan(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); else { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); ldb_scan(scan); } @@ -262,9 +270,6 @@ uint64_t read_flags() return 0; } - -int component_rank_max = COMPONENT_DEFAULT_RANK + 1; /*Used defined max component rank accepted*/ - /* Long options structure for getopt_long */ static struct option long_options[] = { {"rank", required_argument, 0, 'r'}, @@ -284,6 +289,7 @@ static struct option long_options[] = { {"max-files", required_argument, 0, 257}, /* Long option only */ {"min-match-hits", required_argument, 0, 258}, /* Long option only */ {"min-match-lines", required_argument, 0, 259}, /* Long option only */ + {"ignore-file-ext", no_argument, 0, 260}, /* Long option only */ {"wfp", no_argument, 0, 'w'}, {"test", no_argument, 0, 't'}, {"version", no_argument, 0, 'v'}, @@ -327,7 +333,7 @@ int main(int argc, char **argv) bool invalid_argument = false; char * ldb_db_name = NULL; - while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhedqH", long_options, &option_index)) != -1) + while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhdqH", long_options, &option_index)) != -1) { /* Check valid alpha is entered */ if (optarg) @@ -355,8 +361,8 @@ int main(int argc, char **argv) component_hint = strdup(optarg); break; case 'r': - component_rank_max = atoi(optarg); - scanlog("Max component rank set to %d\n", component_rank_max); + scan_ranking_threshold = atoi(optarg); + scanlog("Max component rank set to %d\n", scan_ranking_threshold); break; case 'k': @@ -422,10 +428,6 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); break; - case 'e': - match_extensions = true; - break; - case 'q': engine_flags = engine_flags_cmd_line; debug_on = true; @@ -451,14 +453,23 @@ int main(int argc, char **argv) case 257: /* --max-files */ fetch_max_files = atoi(optarg); + scanlog("Max files to fetch set to %d\n", fetch_max_files); break; case 258: /* --min-match-hits */ - min_match_hits = atoi(optarg); + scan_min_match_hits = atoi(optarg); + scan_adjust_tolerance = false; + scanlog("Min match hits set to %d (auto-adjust disabled)\n", scan_min_match_hits); break; case 259: /* --min-match-lines */ - min_match_lines = atoi(optarg); + scan_min_match_lines = atoi(optarg); + scan_adjust_tolerance = false; + scanlog("Min match lines set to %d (auto-adjust disabled)\n", scan_min_match_lines); + break; + case 260: /* --ignore-file-ext */ + scan_honor_file_extension = false; + scanlog("File extension matching disabled\n"); break; case 'H': @@ -528,7 +539,8 @@ int main(int argc, char **argv) { /* Init scan structure */ if (ishash) - hash_scan(target, scan_max_snippets, scan_max_components); + hash_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); else { bool wfp_extension = false; @@ -541,7 +553,8 @@ int main(int argc, char **argv) /* Scan wfp file */ if (wfp_extension) - wfp_scan(target, scan_max_snippets, scan_max_components); + wfp_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); else if (bfp_extension) binary_scan(target); @@ -550,12 +563,11 @@ int main(int argc, char **argv) else { scanlog("Scanning file %s\n", target); - scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); ldb_scan(scan); } } - - } /* Close main report structure */ diff --git a/src/report.c b/src/report.c index 190f8d5..639942c 100644 --- a/src/report.c +++ b/src/report.c @@ -48,6 +48,7 @@ uint64_t engine_flags = 0; char kb_version[MAX_INPUT]; +static bool ranking_enabled = false; /** * @brief Open JSON report @@ -254,6 +255,10 @@ bool print_json_component(component_data_t * component) printf(",%s", json_remove_invalid_char(component->license_text)); } + if (ranking_enabled) + printf(",\"rank\": %d", component->rank); + + if (!(engine_flags & DISABLE_HEALTH)) { if (!component->health_text) @@ -308,6 +313,7 @@ bool print_json_component(component_data_t * component) bool print_json_match(struct match_data_t * match) { + if (!match->component_list.headp.lh_first) { scanlog("Match with no components ignored: %s", match->source_md5); @@ -318,6 +324,9 @@ bool print_json_match(struct match_data_t * match) if (engine_flags & DISABLE_BEST_MATCH) printf("{"); + if (match->scan_ower->component_ranking_threshold >= 0) + ranking_enabled = true; + printf("\"id\": \"%s\"", matchtypes[match->type]); printf(",\"lines\": \"%s\"", match->line_ranges); printf(",\"oss_lines\": \"%s\"", match->oss_ranges); diff --git a/src/scan.c b/src/scan.c index 0123ab0..895d528 100644 --- a/src/scan.c +++ b/src/scan.c @@ -53,7 +53,7 @@ char *ignored_assets = NULL; @param target File to scan @return Scan data */ -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) { scanlog("Scan Init\n"); scan_data_t * scan = calloc(1, sizeof(*scan)); @@ -62,8 +62,13 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) scan->hashes = calloc(MAX_FILE_SIZE,1); scan->lines = malloc(MAX_FILE_SIZE); scan->match_type = MATCH_NONE; - + scan->component_ranking_threshold = component_ranking_threshold; + scan->snippet_adjust_tolerance = adjust_tolerance; + scan->snippet_min_hits = snippet_min_hits; + scan->snippet_min_lines = snippet_min_lines; + scan->snippet_honor_file_extension = snippet_honor_file_extension; scan->max_components_to_process = max_components; + scan->snippet_range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; scan->max_snippets_to_process = max_snippets > MAX_MULTIPLE_COMPONENTS ? MAX_MULTIPLE_COMPONENTS : max_snippets; scan->max_snippets_to_process = scan->max_snippets_to_process == 0 ? 1 : scan->max_snippets_to_process; @@ -189,9 +194,9 @@ int asset_declared(component_data_t * comp) * @param scan Scan data * @return Scan result (SUCCESS/FAILURE) |**/ -int hash_scan(char *path, int scan_max_snippets, int scan_max_components) +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_honor_file_extension); scan->preload = true; /* Get file MD5 */ @@ -216,7 +221,7 @@ int hash_scan(char *path, int scan_max_snippets, int scan_max_components) * @param scan_max_components Limit for component to be displayed. 1 by default. * @return EXIT_SUCCESS */ -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) { scan_data_t * scan = NULL; char * line = NULL; @@ -303,7 +308,7 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) } /*Init a new scan object for the next file to be scanned */ - scan = scan_data_init(target, scan_max_snippets, scan_max_components); + scan = scan_data_init(target, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_honor_file_extension); strcpy(scan->source_md5, tmp_md5_hex); extract_csv(scan->file_size, (char *)rec, 1, LDB_MAX_REC_LN); scan->preload = true; @@ -453,6 +458,13 @@ void ldb_scan(scan_data_t *scan) scan->match_type = MATCH_NONE; scan->timer = microseconds_now(); + if (scan->component_ranking_threshold < 0) + component_rank_max = -1; // disable ranking + else if (scan->component_ranking_threshold == 0) + component_rank_max = COMPONENT_DEFAULT_RANK + 1; // all accepted + else + component_rank_max = scan->component_ranking_threshold; + /* Get file length */ uint64_t file_size = 0; diff --git a/src/snippet_selection.c b/src/snippet_selection.c index 0d29172..30e3f2e 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -121,7 +121,7 @@ void biggest_snippet(scan_data_t *scan) if (j < 0) continue; - if (scan->matchmap[j].hits >= min_match_hits) /* Only consider file with more than min_match_hits */ + if (scan->matchmap[j].hits >= scan->snippet_min_hits) /* Only consider file with more than min_match_hits */ { match_data_t *match_new = calloc(1, sizeof(match_data_t)); /* Create a match object */ memcpy(match_new->file_md5, scan->matchmap[j].md5, oss_file.key_ln); @@ -133,14 +133,14 @@ void biggest_snippet(scan_data_t *scan) match_new->scan_ower = scan; int i = 0; - if (snippet_extension_discard(match_new)) + if (scan->snippet_honor_file_extension && snippet_extension_discard(match_new)) { match_data_free(match_new); continue; } int matched_lines = compile_ranges(match_new); - if (matched_lines < min_match_lines) { + if (matched_lines < scan->snippet_min_lines) { match_data_free(match_new); continue; } @@ -243,7 +243,7 @@ void add_snippet_ids(match_data_t *match, char *snippet_ids, long from, long to) * @param scan[out] pointer to scan data * @return hits */ -int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges) +int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_match_lines) { int out = 0; /* Walk ranges */ @@ -257,14 +257,14 @@ int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges) { if (from == 0) from = 1; + //discard snippets below the limit of detection + if (to - from < min_match_lines) + continue; /* Add commas unless it is the first range */ if (*line_ranges) strcat(line_ranges, ","); if (*oss_ranges) strcat(oss_ranges, ","); - //discard snippets below the limit of detection - if (to - from < min_match_lines) - continue; /* Add from-to values */ sprintf(line_ranges + strlen(line_ranges), "%d-%d", from, to); @@ -291,7 +291,7 @@ int range_comp(const void *a, const void *b) * @brief Join overlapping ranges * @param ranges ranges list to process */ -matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) +matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance) { matchmap_range *out_ranges = malloc(sizeof(matchmap_range) * MATCHMAP_RANGES); @@ -368,7 +368,7 @@ uint32_t compile_ranges(match_data_t *match) if (debug_on) { - scanlog("Accepted ranges (min lines range = %d):\n", min_match_lines); + scanlog("Accepted ranges (min lines range = %d):\n", match->scan_ower->snippet_min_lines); for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) { if ( match->matchmap_reg->range[i].from && match->matchmap_reg->range[i].to) @@ -377,7 +377,7 @@ uint32_t compile_ranges(match_data_t *match) } } - matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number); + matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance); if (engine_flags & ENABLE_SNIPPET_IDS) { @@ -399,7 +399,7 @@ uint32_t compile_ranges(match_data_t *match) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } } - hits = ranges_assemble(ranges, line_ranges, oss_ranges); + hits = ranges_assemble(ranges, line_ranges, oss_ranges, match->scan_ower->snippet_min_lines); match->line_ranges = strdup(line_ranges); match->oss_ranges = strdup(oss_ranges); match->snippet_ids = strdup(snippet_ids); diff --git a/src/snippets.c b/src/snippets.c index c25c366..1cffff6 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -95,11 +95,7 @@ bool skip_snippets(char *src, uint64_t srcln) scanlog("Skipping snippets: Binary file\n"); return true; // is binary } - /*if (unwanted_header(src)) - { - scanlog("Skipping snippets: Ignored contents\n"); - return true; - }*/ + return false; } @@ -111,18 +107,12 @@ static void adjust_tolerance(scan_data_t *scan) { bool skip = false; uint32_t wfpcount = scan->hash_count; + int range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; /** A maximum number of non-matched lines tolerated inside a matching range */ + int min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; /** Minimum number of lines matched for a match range to be acepted */ + int min_match_hits = SNIPPETS_DEFAULT_MIN_MATCH_HITS; /** Minimum number of snippet ID hits to produce a snippet match*/ - if (!wfpcount) - skip = true; - else if (scan->lines[wfpcount - 1] < 10) - skip = true; - - if (skip) - { - min_match_lines = 5; - min_match_hits = 2; - } - else + + if (wfpcount && scan->lines[wfpcount - 1] > SNIPPETS_DEFAULT_MIN_MATCH_LINES * 2) { /* Range tolerance is the maximum amount of non-matched lines accepted within a matched range. This goes from 21 in small files to 5 in large files */ @@ -142,7 +132,9 @@ static void adjust_tolerance(scan_data_t *scan) if (min_match_hits > 9) min_match_hits = 9; } - + scan->snippet_min_hits = min_match_hits; + scan->snippet_min_lines = min_match_lines; + scan->snippet_range_tolerance = range_tolerance; scanlog("Match hits: %d, Tolerance: range=%d, lines=%d, wfpcount=%u\n", min_match_hits, range_tolerance, min_match_lines, wfpcount); } @@ -278,7 +270,7 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 } /* Increase range */ - else if (gap < range_tolerance) + else if (gap < scan->snippet_range_tolerance) { range_found = true; /* Update range start (from) */ @@ -338,7 +330,8 @@ match_t ldb_scan_snippets(scan_data_t *scan) return MATCH_NONE; matchmap_setup(scan); - adjust_tolerance(scan); + if (scan->snippet_adjust_tolerance) + adjust_tolerance(scan); /* First build a map with all the MD5s related with each WFP from the source file*/ matchmap_entry_t map[scan->hash_count]; @@ -381,7 +374,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) memset(map_indirection_index, 0, sizeof(map_indirection_index)); scanlog ("< Snippet scan setup: Total lines: %d ,Matchmap size: %d, Min hits: %d, Min lines: %d, Map max size = %d, Cat N = %d x %d, Cat size = %d >\n", - scan->total_lines, scan->max_matchmap_size, min_match_hits, min_match_lines, map_max_size, MAP_INDIRECTION_CAT_NUMBER, map_indedirection_items_size, MAP_INDIRECTION_CAT_SIZE); + scan->total_lines, scan->max_matchmap_size, scan->snippet_min_hits, scan->snippet_min_lines, map_max_size, MAP_INDIRECTION_CAT_NUMBER, map_indedirection_items_size, MAP_INDIRECTION_CAT_SIZE); for (int i =0; i < scan->hash_count; i++) { @@ -484,7 +477,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) for (int sector = 0; sector < 256; sector++) { scan->matchmap_rank_by_sector[sector] = -1; - int sector_max = min_match_hits; + int sector_max = scan->snippet_min_hits; for (int cat = 0; cat < cat_limit_index; cat++) { /* travel the cathegories map*/ @@ -555,7 +548,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) { int wfp_p = wfp_index * WFP_REC_LN; int sector = md5s[wfp_p]; - int sector_max = min_match_hits; + int sector_max = scan->snippet_min_hits; if (scan->matchmap_rank_by_sector[sector] < 0) continue; diff --git a/src/url.c b/src/url.c index 19fa9e4..23d76f4 100644 --- a/src/url.c +++ b/src/url.c @@ -51,6 +51,8 @@ * @param ptr //TODO * @return //TODO */ +int component_rank_max = COMPONENT_DEFAULT_RANK + 1; /*Used defined max component rank accepted*/ + bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { if (!datalen && datalen >= MAX_PATH) return false; @@ -71,7 +73,12 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra component_data_t * new_comp = calloc(1, sizeof(*new_comp)); bool result = fill_component(new_comp, NULL, NULL, (uint8_t*) data); scanlog("URL MATCH: %s\n", data); - if (result && new_comp->rank <= component_rank_max) + if (!result || (component_rank_max > 0 && new_comp->rank > component_rank_max)) + { + scanlog("ignoring component with rank %d\n", new_comp->rank); + component_data_free(new_comp); + } + else { /* Save match component id */ memcpy(new_comp->url_md5, key, LDB_KEY_LN); @@ -85,11 +92,6 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra asset_declared(new_comp); component_list_add(component_list, new_comp, component_date_comparation, true); } - else - { - scanlog("ignoring component with rank %d\n", new_comp->rank); - component_data_free(new_comp); - } free(data); return false; } @@ -341,7 +343,7 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, { component_data_t * comp = calloc(1, sizeof(*comp)); bool result = fill_component(comp, key, NULL, (uint8_t *)url); - if (!result || comp->rank > component_rank_max) + if (!result || (component_rank_max > 0 && comp->rank > component_rank_max)) { scanlog("ignoring component with rank %d\n", comp->rank); free(url); diff --git a/src/vulnerability.c b/src/vulnerability.c index 3765014..5673023 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -40,6 +40,7 @@ #include "versions.h" /** @brief //TODO */ const char *vulnerability_sources[] = {"nvd", "github_advisories"}; +const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ /** From 82dee4a19c07ebc27e795920bda5decf78c8b095 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Fri, 2 Jan 2026 10:17:51 +0000 Subject: [PATCH 10/20] add rank filtered status to report --- inc/component.h | 4 +++- src/report.c | 9 +++++++++ src/snippets.c | 1 - src/url.c | 18 +++++++++++++++--- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/inc/component.h b/inc/component.h index 9616e48..9748c10 100644 --- a/inc/component.h +++ b/inc/component.h @@ -19,11 +19,13 @@ extern int component_rank_max; * */ enum { + IDENTIFIED_FILTERED = -1, IDENTIFIED_NONE = 0, IDENTIFIED_PURL, - IDENTIFIED_PURL_VERSION + IDENTIFIED_PURL_VERSION, }; + typedef struct component_data_t { char * vendor; /* component vendor */ diff --git a/src/report.c b/src/report.c index 639942c..8c30540 100644 --- a/src/report.c +++ b/src/report.c @@ -214,6 +214,15 @@ bool print_json_component(component_data_t * component) printf("{"); else printf(","); + //if the component is filtered just report the rank without extra details. + if (component->identified == IDENTIFIED_FILTERED) + { + printf("\"status\": \"filtered\""); + printf(",\"rank\": %d", component->rank); + if (engine_flags & DISABLE_BEST_MATCH) + printf("}"); + return false; + } /* Fetch related purls */ fetch_related_purls(component); diff --git a/src/snippets.c b/src/snippets.c index 1cffff6..7c5f9ab 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -105,7 +105,6 @@ bool skip_snippets(char *src, uint64_t srcln) */ static void adjust_tolerance(scan_data_t *scan) { - bool skip = false; uint32_t wfpcount = scan->hash_count; int range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; /** A maximum number of non-matched lines tolerated inside a matching range */ int min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; /** Minimum number of lines matched for a match range to be acepted */ diff --git a/src/url.c b/src/url.c index 23d76f4..cbb21c1 100644 --- a/src/url.c +++ b/src/url.c @@ -73,9 +73,8 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra component_data_t * new_comp = calloc(1, sizeof(*new_comp)); bool result = fill_component(new_comp, NULL, NULL, (uint8_t*) data); scanlog("URL MATCH: %s\n", data); - if (!result || (component_rank_max > 0 && new_comp->rank > component_rank_max)) + if (!result) { - scanlog("ignoring component with rank %d\n", new_comp->rank); component_data_free(new_comp); } else @@ -90,6 +89,12 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra new_comp->file_md5_ref = component_list->match_ref->file_md5; new_comp->identified = IDENTIFIED_NONE; asset_declared(new_comp); + if (component_rank_max > 0 && new_comp->rank > component_rank_max) + { + scanlog("Setting component with rank %d as filtered\n", new_comp->rank); + new_comp->identified = IDENTIFIED_FILTERED; + } + component_list_add(component_list, new_comp, component_date_comparation, true); } free(data); @@ -343,7 +348,7 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, { component_data_t * comp = calloc(1, sizeof(*comp)); bool result = fill_component(comp, key, NULL, (uint8_t *)url); - if (!result || (component_rank_max > 0 && comp->rank > component_rank_max)) + if (!result) { scanlog("ignoring component with rank %d\n", comp->rank); free(url); @@ -354,6 +359,13 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, asset_declared(comp); purl_latest_version_add(comp); + if (component_rank_max > 0 && comp->rank > component_rank_max) + { + scanlog("Setting component with rank %d as filtered\n", comp->rank); + comp->identified = IDENTIFIED_FILTERED; + } + + if (!comp_oldest) { *comp_address = comp; From a8054dd7dd2df21dc6964ffae2208b98d1fbeab4 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Mon, 5 Jan 2026 14:16:56 +0000 Subject: [PATCH 11/20] add tolerance flag --- inc/scan.h | 6 +++--- src/binary_scan.c | 2 +- src/debug.c | 2 +- src/main.c | 32 ++++++++++++++++++++------------ src/scan.c | 12 ++++++------ 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/inc/scan.h b/inc/scan.h index b75a564..b147a37 100644 --- a/inc/scan.h +++ b/inc/scan.h @@ -76,12 +76,12 @@ typedef struct scan_data_t extern bool force_snippet_scan; -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); void scan_data_free (scan_data_t * scan); void ldb_scan(scan_data_t * scan); match_t ldb_scan_snippets(scan_data_t *scan_ptr); -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); -int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension); +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension); #endif diff --git a/src/binary_scan.c b/src/binary_scan.c index 4646a36..2008c90 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -270,7 +270,7 @@ int binary_scan(char * input) char * file_name = field_n(3,input); int target_len = strchr(file_name,',') - file_name; char * target = strndup(file_name, target_len); - scan_data_t * scan = scan_data_init(target, 1, 1, true, false, 3, 5, false); + scan_data_t * scan = scan_data_init(target, 1, 1, true, 0, 3, 5, SNIPPETS_DEFAULT_RANGE_TOLERANCE, false); free(target); memcpy(scan->md5, bin_md5, MD5_LEN); scan->match_type = MATCH_FILE; diff --git a/src/debug.c b/src/debug.c index 8825ebc..e65c9c6 100644 --- a/src/debug.c +++ b/src/debug.c @@ -138,7 +138,7 @@ void scan_benchmark() for (int f = 0; f < total_files ; f++) { - scan_data_t * scan = scan_data_init("pseudo_file", 0, 0, true, false, 3, 5, false); + scan_data_t * scan = scan_data_init("pseudo_file", 0, 0, true, 0, 3, 5, SNIPPETS_DEFAULT_RANGE_TOLERANCE, false); scan->preload = true; memcpy(scan->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", MD5_LEN); strcpy(scan->file_size, "1024"); diff --git a/src/main.c b/src/main.c index 9d100e3..385bfc8 100644 --- a/src/main.c +++ b/src/main.c @@ -66,6 +66,7 @@ component_item *declared_components; int scan_min_match_lines = SNIPPETS_DEFAULT_MIN_MATCH_LINES; // Minimum number of lines matched for a match range to be acepted int scan_min_match_hits = SNIPPETS_DEFAULT_MIN_MATCH_HITS; // Minimum number of snippet ID hits to produce a snippet match +int scan_range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; // Maximum number of non-matched lines tolerated inside a matching range bool scan_adjust_tolerance = SNIPPETS_DEFAULT_ADJUST_TOLERANCE; /** Adjust tolerance based on file size */ int scan_ranking_threshold = 0; //enabled, all accepted by default bool scan_honor_file_extension = SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION; @@ -214,12 +215,12 @@ void recurse_directory(char *name) if (extension(path)) if (!strcmp(extension(path), "wfp")) wfp = true; if (wfp) - wfp_scan(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, - scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); + wfp_scan(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, - scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); ldb_scan(scan); } @@ -290,6 +291,7 @@ static struct option long_options[] = { {"min-match-hits", required_argument, 0, 258}, /* Long option only */ {"min-match-lines", required_argument, 0, 259}, /* Long option only */ {"ignore-file-ext", no_argument, 0, 260}, /* Long option only */ + {"range-tolerance", required_argument, 0, 261}, /* Long option only */ {"wfp", no_argument, 0, 'w'}, {"test", no_argument, 0, 't'}, {"version", no_argument, 0, 'v'}, @@ -467,11 +469,17 @@ int main(int argc, char **argv) scan_adjust_tolerance = false; scanlog("Min match lines set to %d (auto-adjust disabled)\n", scan_min_match_lines); break; + case 260: /* --ignore-file-ext */ scan_honor_file_extension = false; scanlog("File extension matching disabled\n"); break; + case 261: /* --range-tolerance */ + scan_range_tolerance = atoi(optarg); + scanlog("Range tolerance set to %d\n", scan_range_tolerance); + break; + case 'H': if (hpsm_lib_load()) hpsm_enabled = true; @@ -538,9 +546,9 @@ int main(int argc, char **argv) else { /* Init scan structure */ - if (ishash) - hash_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, - scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); + if (ishash) + hash_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else { bool wfp_extension = false; @@ -552,9 +560,9 @@ int main(int argc, char **argv) if (force_bfp) bfp_extension = true; /* Scan wfp file */ - if (wfp_extension) - wfp_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, - scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); + if (wfp_extension) + wfp_scan(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); else if (bfp_extension) binary_scan(target); @@ -563,8 +571,8 @@ int main(int argc, char **argv) else { scanlog("Scanning file %s\n", target); - scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, - scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_honor_file_extension); + scan_data_t * scan = scan_data_init(target, scan_max_snippets, scan_max_components, scan_adjust_tolerance, + scan_ranking_threshold, scan_min_match_hits, scan_min_match_lines, scan_range_tolerance, scan_honor_file_extension); ldb_scan(scan); } } diff --git a/src/scan.c b/src/scan.c index 895d528..1717ab9 100644 --- a/src/scan.c +++ b/src/scan.c @@ -53,7 +53,7 @@ char *ignored_assets = NULL; @param target File to scan @return Scan data */ -scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) +scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { scanlog("Scan Init\n"); scan_data_t * scan = calloc(1, sizeof(*scan)); @@ -68,7 +68,7 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, scan->snippet_min_lines = snippet_min_lines; scan->snippet_honor_file_extension = snippet_honor_file_extension; scan->max_components_to_process = max_components; - scan->snippet_range_tolerance = SNIPPETS_DEFAULT_RANGE_TOLERANCE; + scan->snippet_range_tolerance = snippet_range_tolerance; scan->max_snippets_to_process = max_snippets > MAX_MULTIPLE_COMPONENTS ? MAX_MULTIPLE_COMPONENTS : max_snippets; scan->max_snippets_to_process = scan->max_snippets_to_process == 0 ? 1 : scan->max_snippets_to_process; @@ -194,9 +194,9 @@ int asset_declared(component_data_t * comp) * @param scan Scan data * @return Scan result (SUCCESS/FAILURE) |**/ -int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) +int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { - scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_honor_file_extension); + scan_data_t * scan = scan_data_init(path, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_range_tolerance, snippet_honor_file_extension); scan->preload = true; /* Get file MD5 */ @@ -221,7 +221,7 @@ int hash_scan(char *path, int scan_max_snippets, int scan_max_components, bool a * @param scan_max_components Limit for component to be displayed. 1 by default. * @return EXIT_SUCCESS */ -int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, bool snippet_honor_file_extension) +int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool adjust_tolerance, int component_ranking_threshold, int snippet_min_hits, int snippet_min_lines, int snippet_range_tolerance, bool snippet_honor_file_extension) { scan_data_t * scan = NULL; char * line = NULL; @@ -308,7 +308,7 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components, bool a } /*Init a new scan object for the next file to be scanned */ - scan = scan_data_init(target, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_honor_file_extension); + scan = scan_data_init(target, scan_max_snippets, scan_max_components, adjust_tolerance, component_ranking_threshold, snippet_min_hits, snippet_min_lines, snippet_range_tolerance, snippet_honor_file_extension); strcpy(scan->source_md5, tmp_md5_hex); extract_csv(scan->file_size, (char *)rec, 1, LDB_MAX_REC_LN); scan->preload = true; From d1ac3cbb2fd0cf39fab607e70e8489c569c67db7 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Mon, 5 Jan 2026 15:14:36 +0000 Subject: [PATCH 12/20] update readme and help --- README.md | 4 ++++ src/help.c | 1 + src/main.c | 2 +- src/scan.c | 2 +- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f8ff79d..b8c9cf5 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ Syntax: scanoss [parameters] [TARGET] * `--max-files NUM` - Set maximum number of files to fetch during matching (default: 12000) * `--min-match-hits NUM` - Set minimum snippet ID hits for a match (default: 3, disables auto-adjust) * `--min-match-lines NUM` - Set minimum matched lines for a range (default: 10, disables auto-adjust) +* `--range-tolerance NUM` - Set max non-matched lines tolerated in a range (default: 5) * `--ignore-file-ext` - Ignore file extension during snippet matching (default: honor extension) ### SBOM and Filtering @@ -117,6 +118,9 @@ scanoss --sbom my_sbom.json TARGET # Scan with custom snippet matching parameters scanoss --min-match-hits 5 --min-match-lines 15 TARGET +# Scan with custom range tolerance +scanoss --range-tolerance 10 TARGET + # Ignore file extensions during matching scanoss --ignore-file-ext TARGET ``` diff --git a/src/help.c b/src/help.c index 96d37bf..a585fe9 100644 --- a/src/help.c +++ b/src/help.c @@ -57,6 +57,7 @@ Configuration:\n\ --max-files NUM Set maximum number of files to fetch during matching (default: 12000).\n\ --min-match-hits NUM Set minimum snippet ID hits for a match (default: 3, disables auto-adjust).\n\ --min-match-lines NUM Set minimum matched lines for a range (default: 10, disables auto-adjust).\n\ + --range-tolerance NUM Set max non-matched lines tolerated in a range (default: 5).\n\ --ignore-file-ext Ignore file extension during snippet matching (default: honor extension).\n\ -s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ -b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ diff --git a/src/main.c b/src/main.c index 385bfc8..8c844b7 100644 --- a/src/main.c +++ b/src/main.c @@ -296,7 +296,6 @@ static struct option long_options[] = { {"test", no_argument, 0, 't'}, {"version", no_argument, 0, 'v'}, {"help", no_argument, 0, 'h'}, - {"extension", no_argument, 0, 'e'}, {"debug", no_argument, 0, 'd'}, {"quiet", no_argument, 0, 'q'}, {"hpsm", no_argument, 0, 'H'}, @@ -477,6 +476,7 @@ int main(int argc, char **argv) case 261: /* --range-tolerance */ scan_range_tolerance = atoi(optarg); + scan_adjust_tolerance = false; scanlog("Range tolerance set to %d\n", scan_range_tolerance); break; diff --git a/src/scan.c b/src/scan.c index 1717ab9..33c85ec 100644 --- a/src/scan.c +++ b/src/scan.c @@ -68,7 +68,7 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components, scan->snippet_min_lines = snippet_min_lines; scan->snippet_honor_file_extension = snippet_honor_file_extension; scan->max_components_to_process = max_components; - scan->snippet_range_tolerance = snippet_range_tolerance; + scan->snippet_range_tolerance = snippet_range_tolerance > 0 ? snippet_range_tolerance : 1; scan->max_snippets_to_process = max_snippets > MAX_MULTIPLE_COMPONENTS ? MAX_MULTIPLE_COMPONENTS : max_snippets; scan->max_snippets_to_process = scan->max_snippets_to_process == 0 ? 1 : scan->max_snippets_to_process; From 59a6a3f90ef7193af219a2de7a807f9b0a9559d3 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Fri, 9 Jan 2026 19:51:20 +0100 Subject: [PATCH 13/20] rename command options --- inc/match.h | 1 - src/main.c | 4 ++-- src/report.c | 5 ++++- src/snippet_selection.c | 36 ++++++++++++++++++++++-------------- 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/inc/match.h b/inc/match.h index a807919..e1edced 100644 --- a/inc/match.h +++ b/inc/match.h @@ -25,7 +25,6 @@ typedef struct match_data_t uint32_t * crclist; /* pointer to crc list used in for processing */ char * quality_text; /* quality string used in json output format */ char * crytography_text; /* crytography string used in json output format */ - uint16_t from; } match_data_t; match_data_t * match_data_copy(match_data_t * in); diff --git a/src/main.c b/src/main.c index 8c844b7..58af1ac 100644 --- a/src/main.c +++ b/src/main.c @@ -288,8 +288,8 @@ static struct option long_options[] = { {"max-snippets", required_argument, 0, 'M'}, {"max-components", required_argument, 0, 'N'}, {"max-files", required_argument, 0, 257}, /* Long option only */ - {"min-match-hits", required_argument, 0, 258}, /* Long option only */ - {"min-match-lines", required_argument, 0, 259}, /* Long option only */ + {"min-snippet-hits", required_argument, 0, 258}, /* Long option only */ + {"min-snippet-lines", required_argument, 0, 259}, /* Long option only */ {"ignore-file-ext", no_argument, 0, 260}, /* Long option only */ {"range-tolerance", required_argument, 0, 261}, /* Long option only */ {"wfp", no_argument, 0, 'w'}, diff --git a/src/report.c b/src/report.c index 8c30540..fb1cc37 100644 --- a/src/report.c +++ b/src/report.c @@ -336,7 +336,10 @@ bool print_json_match(struct match_data_t * match) if (match->scan_ower->component_ranking_threshold >= 0) ranking_enabled = true; - printf("\"id\": \"%s\"", matchtypes[match->type]); + printf("\"id\": \"%s\"", matchtypes[match->type]); + if (!match->scan_ower->snippet_adjust_tolerance && match->type == MATCH_SNIPPET) + printf(",\"hits\": %d", match->hits); + printf(",\"lines\": \"%s\"", match->line_ranges); printf(",\"oss_lines\": \"%s\"", match->oss_ranges); printf(",\"matched\": \"%d%%\"", match->matched_percent); diff --git a/src/snippet_selection.c b/src/snippet_selection.c index 30e3f2e..daa9e8b 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -128,7 +128,6 @@ void biggest_snippet(scan_data_t *scan) match_new->hits = scan->matchmap[j].hits; match_new->matchmap_reg = &scan->matchmap[j]; match_new->type = scan->match_type; - match_new->from = scan->matchmap[j].range->from; strcpy(match_new->source_md5, scan->source_md5); match_new->scan_ower = scan; int i = 0; @@ -243,11 +242,11 @@ void add_snippet_ids(match_data_t *match, char *snippet_ids, long from, long to) * @param scan[out] pointer to scan data * @return hits */ -int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_match_lines) +int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_match_lines, int ranges_number) { int out = 0; /* Walk ranges */ - for (int i = 0; i < MATCHMAP_RANGES; i++) + for (int i = 0; i < ranges_number; i++) { int to = ranges[i].to; int from = ranges[i].from; @@ -291,9 +290,13 @@ int range_comp(const void *a, const void *b) * @brief Join overlapping ranges * @param ranges ranges list to process */ -matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance) +matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance, bool fixed_ranges) { - matchmap_range *out_ranges = malloc(sizeof(matchmap_range) * MATCHMAP_RANGES); + int out_size = MATCHMAP_RANGES; + if (fixed_ranges) + out_size = size; + + matchmap_range *out_ranges = calloc(out_size, sizeof(matchmap_range)); int processed = 0; int tolerance = range_tolerance > 0 ? range_tolerance : 1; @@ -301,8 +304,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r { int out_ranges_index = -1; processed = 0; - out_ranges[0] = ranges[0]; - memset(out_ranges, 0, sizeof(matchmap_range) * MATCHMAP_RANGES); + memset(out_ranges, 0, sizeof(matchmap_range) * out_size); scanlog("Range tolerance: %d\n", tolerance); for (int i = 0; i < size; i++) { @@ -319,7 +321,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r else { out_ranges_index++; - if (out_ranges_index == MATCHMAP_RANGES) + if (out_ranges_index == MATCHMAP_RANGES && !fixed_ranges) break; out_ranges[out_ranges_index].from = ranges[i].from; out_ranges[out_ranges_index].to = ranges[i].to; @@ -328,6 +330,8 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r processed++; } } + if (fixed_ranges) + break; tolerance *= 2; } @@ -352,7 +356,11 @@ void ranges_sort(matchmap_range *ranges, int size) */ uint32_t compile_ranges(match_data_t *match) { - + if (match->matchmap_reg->ranges_number <= 0) + { + scanlog("No ranges to compile\n"); + return 0; + } char line_ranges[MAX_FIELD_LN * 2] = "\0"; char oss_ranges[MAX_FIELD_LN * 2] = "\0"; char snippet_ids[MAX_SNIPPET_IDS_RETURNED * WFP_LN * 2 + MATCHMAP_RANGES + 1] = "\0"; @@ -377,11 +385,11 @@ uint32_t compile_ranges(match_data_t *match) } } - matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance); - + matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance, !match->scan_ower->snippet_adjust_tolerance); + int ranges_number = !match->scan_ower->snippet_adjust_tolerance ? match->matchmap_reg->ranges_number : MATCHMAP_RANGES; if (engine_flags & ENABLE_SNIPPET_IDS) { - for (int range = 0; range < MATCHMAP_RANGES; range++) + for (int range = 0; range < ranges_number; range++) { if (!ranges[range].from && !ranges[range].to) break; @@ -393,13 +401,13 @@ uint32_t compile_ranges(match_data_t *match) if (debug_on) { scanlog("Final ranges:\n"); - for (uint32_t i = 0; i < MATCHMAP_RANGES; i++) + for (uint32_t i = 0; i < ranges_number; i++) { if ( ranges[i].from && ranges[i].to) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } } - hits = ranges_assemble(ranges, line_ranges, oss_ranges, match->scan_ower->snippet_min_lines); + hits = ranges_assemble(ranges, line_ranges, oss_ranges, match->scan_ower->snippet_min_lines, ranges_number); match->line_ranges = strdup(line_ranges); match->oss_ranges = strdup(oss_ranges); match->snippet_ids = strdup(snippet_ids); From 2713bfb17402cb40f34744f48f137fc58aaf5f3b Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Mon, 12 Jan 2026 12:31:33 +0100 Subject: [PATCH 14/20] update version, update help, disable dynamic ranges --- inc/scanoss.h | 2 +- src/help.c | 6 +++--- src/snippet_selection.c | 13 +++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/inc/scanoss.h b/inc/scanoss.h index 73a6295..8c4bbc4 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -33,7 +33,7 @@ #define WFP_LN 4 #define WFP_REC_LN 18 -#define SCANOSS_VERSION "5.4.19" +#define SCANOSS_VERSION "5.4.20" /* Log files */ #define SCAN_LOG "/tmp/scanoss_scan.log" diff --git a/src/help.c b/src/help.c index a585fe9..075f9fb 100644 --- a/src/help.c +++ b/src/help.c @@ -55,13 +55,13 @@ Configuration:\n\ -T, --tolerance NUM Set snippet scanning tolerance percentage (default: 0.1).\n\ -r, --rank NUM Set maximum component rank accepted (default: %d).\n\ --max-files NUM Set maximum number of files to fetch during matching (default: 12000).\n\ - --min-match-hits NUM Set minimum snippet ID hits for a match (default: 3, disables auto-adjust).\n\ - --min-match-lines NUM Set minimum matched lines for a range (default: 10, disables auto-adjust).\n\ + --min-snippet-hits NUM Set minimum snippet ID hits for a match (default: 3, disables auto-adjust).\n\ + --min-snippet-lines NUM Set minimum matched lines for a range (default: 10, disables auto-adjust).\n\ --range-tolerance NUM Set max non-matched lines tolerated in a range (default: 5).\n\ --ignore-file-ext Ignore file extension during snippet matching (default: honor extension).\n\ -s, --sbom FILE Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ -b, --blacklist FILE Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ - --force-snippet FILE Same as \"-b\" but with forced snippet scanning.\n\ + --force-snippet Force snippet scanning (no full file matching).\n\ -a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\ -c, --component HINT Add a component HINT to guide scan results.\n\ -k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\ diff --git a/src/snippet_selection.c b/src/snippet_selection.c index daa9e8b..a16d12a 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -242,7 +242,7 @@ void add_snippet_ids(match_data_t *match, char *snippet_ids, long from, long to) * @param scan[out] pointer to scan data * @return hits */ -int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_match_lines, int ranges_number) +int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, int min_range_lines, int ranges_number) { int out = 0; /* Walk ranges */ @@ -257,7 +257,7 @@ int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges, if (from == 0) from = 1; //discard snippets below the limit of detection - if (to - from < min_match_lines) + if (to - from < min_range_lines) continue; /* Add commas unless it is the first range */ if (*line_ranges) @@ -290,10 +290,11 @@ int range_comp(const void *a, const void *b) * @brief Join overlapping ranges * @param ranges ranges list to process */ -matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance, bool fixed_ranges) +matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance, bool dynamic_ranges) { int out_size = MATCHMAP_RANGES; - if (fixed_ranges) + dynamic_ranges = false; // TODO: disable dynamic ranges for now + if (dynamic_ranges) out_size = size; matchmap_range *out_ranges = calloc(out_size, sizeof(matchmap_range)); @@ -321,7 +322,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r else { out_ranges_index++; - if (out_ranges_index == MATCHMAP_RANGES && !fixed_ranges) + if (out_ranges_index == MATCHMAP_RANGES && !dynamic_ranges) break; out_ranges[out_ranges_index].from = ranges[i].from; out_ranges[out_ranges_index].to = ranges[i].to; @@ -330,7 +331,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r processed++; } } - if (fixed_ranges) + if (dynamic_ranges) break; tolerance *= 2; } From c649df4f206c9bb0ed84903a3b519d9d598b16dd Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Mon, 12 Jan 2026 12:31:33 +0100 Subject: [PATCH 15/20] update version, update help, disable dynamic ranges --- src/snippet_selection.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/snippet_selection.c b/src/snippet_selection.c index a16d12a..0902d2c 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -293,7 +293,6 @@ int range_comp(const void *a, const void *b) matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int range_tolerance, bool dynamic_ranges) { int out_size = MATCHMAP_RANGES; - dynamic_ranges = false; // TODO: disable dynamic ranges for now if (dynamic_ranges) out_size = size; @@ -385,7 +384,7 @@ uint32_t compile_ranges(match_data_t *match) match->matchmap_reg->range[i].oss_line); } } - + match->scan_ower->snippet_adjust_tolerance = true; //TODO we will disable dynamic ranges for now. matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance, !match->scan_ower->snippet_adjust_tolerance); int ranges_number = !match->scan_ower->snippet_adjust_tolerance ? match->matchmap_reg->ranges_number : MATCHMAP_RANGES; if (engine_flags & ENABLE_SNIPPET_IDS) From fdac26b223368c72917ecd732a863f978a0a4780 Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Wed, 14 Jan 2026 00:09:21 +0100 Subject: [PATCH 16/20] fix coderabbit issues --- inc/scan.h | 2 +- src/license.c | 31 +++++++++++++++++++------------ src/snippet_selection.c | 4 +++- src/versions.c | 2 +- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/inc/scan.h b/inc/scan.h index b147a37..1de390a 100644 --- a/inc/scan.h +++ b/inc/scan.h @@ -71,7 +71,7 @@ typedef struct scan_data_t int snippet_min_hits; int snippet_min_lines; int snippet_range_tolerance; - int snippet_honor_file_extension; + bool snippet_honor_file_extension; } scan_data_t; extern bool force_snippet_scan; diff --git a/src/license.c b/src/license.c index a2dc600..63a8903 100644 --- a/src/license.c +++ b/src/license.c @@ -46,9 +46,14 @@ bool full_license_report = false; +struct license_type +{ + char * text; + int id; +}; struct license_list { - char **licenses; + struct license_type*licenses; int count; }; @@ -91,22 +96,24 @@ static char * license_id_to_source_name(int id) } } -bool license_add_to_list(struct license_list * ptr, char * license) +bool license_add_to_list(struct license_list * ptr, char * license, int license_id) { if (!ptr || !license || strlen(license) < 2) return false; - ptr->licenses = realloc(ptr->licenses, sizeof(char *) * (ptr->count + 1)); - if (!ptr->licenses) + + struct license_type *tmp = realloc(ptr->licenses, sizeof(struct license_type) * (ptr->count + 1)); + if (!tmp) return false; + ptr->licenses = tmp; /* Allocate with extra padding for CRC32C hardware reads (8-byte blocks) */ size_t len = strlen(license); size_t padded_len = ((len + 8) / 8) * 8; /* Round up to next 8-byte boundary */ - ptr->licenses[ptr->count] = calloc(1, padded_len); - if (!ptr->licenses[ptr->count]) + ptr->licenses[ptr->count].text = calloc(1, padded_len); + if (!ptr->licenses[ptr->count].text) return false; - strcpy(ptr->licenses[ptr->count], license); - + strcpy(ptr->licenses[ptr->count].text, license); + ptr->licenses[ptr->count].id = license_id; ptr->count++; return true; } @@ -117,7 +124,7 @@ void license_free_list(struct license_list * ptr) return; for (int i = 0; i < ptr->count; i++) { - free(ptr->licenses[i]); + free(ptr->licenses[i].text); } free(ptr->licenses); ptr->licenses = NULL; @@ -383,7 +390,7 @@ bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * int src = atoi(source); scanlog("Fetched License %s - source ID %d\n", license, src); - license_add_to_list(licenses, license); + license_add_to_list(licenses, license, src); free(source); free(license); @@ -418,7 +425,7 @@ void print_licenses(component_data_t *comp) /* Print URL license */ if (comp->license && strlen(comp->license) > 2) { - license_add_to_list(&licenses_by_type, comp->license); + license_add_to_list(&licenses_by_type, comp->license, 0); scanlog("License present in URL table"); } else @@ -484,7 +491,7 @@ void print_licenses(component_data_t *comp) for (int i = 0; i < licenses_by_type.count; i++) { - buffer = license_to_json(crclist, buffer, licenses_by_type.licenses[i], i, &first); + buffer = license_to_json(crclist, buffer, licenses_by_type.licenses[i].text, licenses_by_type.licenses[i].id, &first); } len = buffer - result; diff --git a/src/snippet_selection.c b/src/snippet_selection.c index 0902d2c..f93f560 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -384,7 +384,9 @@ uint32_t compile_ranges(match_data_t *match) match->matchmap_reg->range[i].oss_line); } } - match->scan_ower->snippet_adjust_tolerance = true; //TODO we will disable dynamic ranges for now. + //TODO: Re-enable dynamic ranges when feature is complete + // For now, we force adjust_tolerance to ensure stable behavior + match->scan_ower->snippet_adjust_tolerance = true; matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance, !match->scan_ower->snippet_adjust_tolerance); int ranges_number = !match->scan_ower->snippet_adjust_tolerance ? match->matchmap_reg->ranges_number : MATCHMAP_RANGES; if (engine_flags & ENABLE_SNIPPET_IDS) diff --git a/src/versions.c b/src/versions.c index f97ec2c..c6bd7fe 100644 --- a/src/versions.c +++ b/src/versions.c @@ -51,7 +51,7 @@ static release_version * purl_version_list[FETCH_MAX_FILES_DEFAULT]; void purl_latest_version_add(component_data_t * component) { - if (!component->purls[0] || !component->release_date || !component->version || purl_indirection_index == fetch_max_files) + if (!component->purls[0] || !component->release_date || !component->version || purl_indirection_index == FETCH_MAX_FILES_DEFAULT) return; for (int i = 0; i < purl_indirection_index; i++) From 6af0ad94efc2566b20b67d65c6d0e05a24bf31c3 Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Wed, 14 Jan 2026 00:25:26 +0100 Subject: [PATCH 17/20] fix more coderabbit issues --- src/license.c | 2 +- src/match.c | 5 +++++ src/snippet_selection.c | 12 ++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/license.c b/src/license.c index 63a8903..f32983d 100644 --- a/src/license.c +++ b/src/license.c @@ -271,7 +271,7 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i char * license_source_id = license_id_to_source_name(src); if (!license_source_id) return buffer; - //skip scancode licenses starting with "license-ref" + //skip scancode licenses starting with "LicenseRef" if (!strncmp(license_source_id, "scancode", 8) && !strncmp(license, "LicenseRef", 10)) return buffer; /* Calculate CRC to avoid duplicates */ diff --git a/src/match.c b/src/match.c index 5b5c877..54c6069 100644 --- a/src/match.c +++ b/src/match.c @@ -1104,6 +1104,11 @@ void compile_matches(scan_data_t *scan) scan->matches_list_array[0] = match_list_init(true, scan->max_snippets_to_process); scan->matches_list_array_index = 1; match_data_t *match_new = calloc(1, sizeof(match_data_t)); + if (!match_new) + { + scanlog("Error allocating memory for match data\n"); + return; + } match_new->type = scan->match_type; strcpy(match_new->source_md5, scan->source_md5); memcpy(match_new->file_md5, scan->match_ptr, MD5_LEN); diff --git a/src/snippet_selection.c b/src/snippet_selection.c index f93f560..f3cd640 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -124,6 +124,11 @@ void biggest_snippet(scan_data_t *scan) if (scan->matchmap[j].hits >= scan->snippet_min_hits) /* Only consider file with more than min_match_hits */ { match_data_t *match_new = calloc(1, sizeof(match_data_t)); /* Create a match object */ + if (!match_new) + { + scanlog("Error allocating memory for match data\n"); + return; + } memcpy(match_new->file_md5, scan->matchmap[j].md5, oss_file.key_ln); match_new->hits = scan->matchmap[j].hits; match_new->matchmap_reg = &scan->matchmap[j]; @@ -297,6 +302,8 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size, int r out_size = size; matchmap_range *out_ranges = calloc(out_size, sizeof(matchmap_range)); + if (!out_ranges) + return NULL; int processed = 0; int tolerance = range_tolerance > 0 ? range_tolerance : 1; @@ -380,13 +387,14 @@ uint32_t compile_ranges(match_data_t *match) for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) { if ( match->matchmap_reg->range[i].from && match->matchmap_reg->range[i].to) - scanlog(" %d = %ld to %ld - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, + scanlog(" %d = %u to %u - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, match->matchmap_reg->range[i].oss_line); } } //TODO: Re-enable dynamic ranges when feature is complete // For now, we force adjust_tolerance to ensure stable behavior match->scan_ower->snippet_adjust_tolerance = true; + scanlog("Snippet adjust tolerance flag is being ignored\n"); matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number, match->scan_ower->snippet_range_tolerance, !match->scan_ower->snippet_adjust_tolerance); int ranges_number = !match->scan_ower->snippet_adjust_tolerance ? match->matchmap_reg->ranges_number : MATCHMAP_RANGES; if (engine_flags & ENABLE_SNIPPET_IDS) @@ -406,7 +414,7 @@ uint32_t compile_ranges(match_data_t *match) for (uint32_t i = 0; i < ranges_number; i++) { if ( ranges[i].from && ranges[i].to) - scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); + scanlog(" %d = %u to %u - OSS from: %u\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } } hits = ranges_assemble(ranges, line_ranges, oss_ranges, match->scan_ower->snippet_min_lines, ranges_number); From a1c30155d2e9c78ab17d0ccdd7c61ec2fee2d357 Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Wed, 14 Jan 2026 13:52:40 +0100 Subject: [PATCH 18/20] fix coderabbit issues --- src/license.c | 2 ++ src/match.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/src/license.c b/src/license.c index f32983d..5f77a35 100644 --- a/src/license.c +++ b/src/license.c @@ -105,6 +105,8 @@ bool license_add_to_list(struct license_list * ptr, char * license, int license_ if (!tmp) return false; ptr->licenses = tmp; + ptr->licenses[ptr->count].text = NULL; ++ ptr->licenses[ptr->count].id = 0; /* Allocate with extra padding for CRC32C hardware reads (8-byte blocks) */ size_t len = strlen(license); diff --git a/src/match.c b/src/match.c index 54c6069..12399eb 100644 --- a/src/match.c +++ b/src/match.c @@ -261,6 +261,11 @@ static inline void initialize_component_age(component_data_t *comp) if (!comp->purls_md5[0] && comp->purls[0]) { comp->purls_md5[0] = malloc(MD5_LEN); + if (!comp->purls_md5[0]) + { + scanlog("critical: MD5 memory allocation failed"); + return; + } MD5((uint8_t *)comp->purls[0], strlen(comp->purls[0]), comp->purls_md5[0]); comp->age = get_component_age(comp->purls_md5[0]); } From 84ccb978d45dcfcd6c6f406e0c34afb8b8a0f2a4 Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Wed, 14 Jan 2026 14:08:45 +0100 Subject: [PATCH 19/20] fix typo --- src/license.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/license.c b/src/license.c index 5f77a35..1faae7d 100644 --- a/src/license.c +++ b/src/license.c @@ -106,7 +106,7 @@ bool license_add_to_list(struct license_list * ptr, char * license, int license_ return false; ptr->licenses = tmp; ptr->licenses[ptr->count].text = NULL; -+ ptr->licenses[ptr->count].id = 0; + ptr->licenses[ptr->count].id = 0; /* Allocate with extra padding for CRC32C hardware reads (8-byte blocks) */ size_t len = strlen(license); From f8e6f9e90ea18dd281b2dd0ca7fd8569f98517b6 Mon Sep 17 00:00:00 2001 From: mscasso-scanoss Date: Wed, 14 Jan 2026 15:31:48 +0100 Subject: [PATCH 20/20] fix license order in the report --- src/license.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/license.c b/src/license.c index 1faae7d..74a2b22 100644 --- a/src/license.c +++ b/src/license.c @@ -133,6 +133,23 @@ void license_free_list(struct license_list * ptr) ptr->count = 0; } +static int license_compare_by_id(const void *a, const void *b) +{ + const struct license_type *la = a; + const struct license_type *lb = b; + + /* IDs 5 and 6 should go to the end */ + bool a_is_last = (la->id == 5 || la->id == 6); + bool b_is_last = (lb->id == 5 || lb->id == 6); + + if (a_is_last && !b_is_last) + return 1; + if (!a_is_last && b_is_last) + return -1; + + return la->id - lb->id; +} + /** * @brief Remove invalid characters from a license name * @param license license string @@ -491,6 +508,10 @@ void print_licenses(component_data_t *comp) buffer = result + len; bool first = true; + /* Sort licenses by id (ascending) */ + if (licenses_by_type.count > 1) + qsort(licenses_by_type.licenses, licenses_by_type.count, sizeof(struct license_type), license_compare_by_id); + for (int i = 0; i < licenses_by_type.count; i++) { buffer = license_to_json(crclist, buffer, licenses_by_type.licenses[i].text, licenses_by_type.licenses[i].id, &first);