From 299514792a3d36fd9dd78c9ba689d5ccd2047b10 Mon Sep 17 00:00:00 2001 From: dikshaa2909 Date: Sat, 14 Feb 2026 00:35:33 +0530 Subject: [PATCH 1/4] Ignore copyright symbols inside URLs Signed-off-by: dikshaa2909 --- src/cluecode/copyrights.py | 12 +++++++++--- tests/cluecode/test_copyrights_basic.py | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..b4c9d34bf1 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -4281,10 +4281,14 @@ def is_candidate(prepared_line): if not prepared_line: return False + # Ignore copyright symbols inside URLs + lowered = prepared_line.lower() + if '(c)' in lowered and ('http://' in lowered or 'https://' in lowered): + return False + if is_only_digit_and_punct(prepared_line): if TRACE: logger_debug(f'is_candidate: is_only_digit_and_punct:\n{prepared_line!r}') - return False if gibberish_detector.detect_gibberish(prepared_line): @@ -4294,13 +4298,15 @@ def is_candidate(prepared_line): if copyrights_hint.years(prepared_line): return True - else: - pass + prepared_line = prepared_line.lower() for marker in copyrights_hint.statement_markers: if marker in prepared_line: return True + return False + + def is_inside_statement( chars_only_line, diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py index 1fbafcf548..61387dffee 100644 --- a/tests/cluecode/test_copyrights_basic.py +++ b/tests/cluecode/test_copyrights_basic.py @@ -14,7 +14,20 @@ from commoncode.testcase import FileBasedTesting from cluecode import copyrights from cluecode.copyrights import prepare_text_line -from cluecode.copyrights import remove_non_chars +from cluecode import copyrights +from cluecode.copyrights import prepare_text_line, remove_non_chars + +def test_copyright_symbol_inside_url_is_ignored(): + text = "See http://example.com/(c)/path for more information." + + prepped = prepare_text_line(text) + + # sanity check + assert '(c)' in prepped + + # URLs containing (c) should NOT be copyright candidates + assert not copyrights.is_candidate(prepped) + class TestTextPreparation(FileBasedTesting): From 36f05ac36d77a33b3cd983de601a1a8aede55d41 Mon Sep 17 00:00:00 2001 From: dikshaa2909 Date: Sat, 14 Feb 2026 01:35:44 +0530 Subject: [PATCH 2/4] Improve copyright symbol handling in URLs Refactor copyright symbol detection to ignore (c) only in URL paths. Signed-off-by: dikshaa2909 --- src/cluecode/copyrights.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index b4c9d34bf1..3b91d08c6e 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -4281,14 +4281,12 @@ def is_candidate(prepared_line): if not prepared_line: return False - # Ignore copyright symbols inside URLs - lowered = prepared_line.lower() - if '(c)' in lowered and ('http://' in lowered or 'https://' in lowered): - return False - - if is_only_digit_and_punct(prepared_line): - if TRACE: - logger_debug(f'is_candidate: is_only_digit_and_punct:\n{prepared_line!r}') + # Ignore (c) only when it appears inside a URL path +lowered = prepared_line.lower() +if '(c)' in lowered: + for url in re.findall(r'https?://\S+', lowered): + if '(c)' in url: + return Falseis_candidate: is_only_digit_and_punct:\n{prepared_line!r}') return False if gibberish_detector.detect_gibberish(prepared_line): From 861555f8002674692f2e22ef89ee0ab2e034e180 Mon Sep 17 00:00:00 2001 From: dikshaa2909 Date: Sat, 14 Feb 2026 01:38:06 +0530 Subject: [PATCH 3/4] Implement test for copyright with URL Add test for copyright detection with URL Signed-off-by: dikshaa2909 --- tests/cluecode/test_copyrights_basic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py index 61387dffee..7f30031788 100644 --- a/tests/cluecode/test_copyrights_basic.py +++ b/tests/cluecode/test_copyrights_basic.py @@ -28,6 +28,12 @@ def test_copyright_symbol_inside_url_is_ignored(): # URLs containing (c) should NOT be copyright candidates assert not copyrights.is_candidate(prepped) +def test_copyright_with_url_is_still_candidate(): + text = "Copyright (c) 2000 Foo, http://example.com" + + prepped = prepare_text_line(text) + + assert copyrights.is_candidate(prepped) class TestTextPreparation(FileBasedTesting): From 06606a9a502b40c87b2a0274fa77c8ecb6f295d6 Mon Sep 17 00:00:00 2001 From: dikshaa2909 Date: Sat, 14 Feb 2026 02:13:00 +0530 Subject: [PATCH 4/4] Ignore (c) symbols inside URLs during copyright detection Signed-off-by: dikshaa2909 --- src/cluecode/copyrights.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 3b91d08c6e..ab591dee8d 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -4281,42 +4281,36 @@ def is_candidate(prepared_line): if not prepared_line: return False - # Ignore (c) only when it appears inside a URL path -lowered = prepared_line.lower() -if '(c)' in lowered: - for url in re.findall(r'https?://\S+', lowered): - if '(c)' in url: - return Falseis_candidate: is_only_digit_and_punct:\n{prepared_line!r}') + if is_only_digit_and_punct(prepared_line): return False if gibberish_detector.detect_gibberish(prepared_line): if TRACE: - logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}') + logger_debug( + f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}' + ) return False + lowered = prepared_line.lower() # ✅ DEFINE ONCE, ALWAYS + + # Ignore lines where (c) appears only in URL-like text + if '(c)' in lowered and 'http' in lowered: + if not copyrights_hint.years(prepared_line): + for marker in copyrights_hint.statement_markers: + if marker != '(c)' and marker in lowered: + break + else: + return False + if copyrights_hint.years(prepared_line): return True - prepared_line = prepared_line.lower() for marker in copyrights_hint.statement_markers: - if marker in prepared_line: + if marker in lowered: return True return False - - -def is_inside_statement( - chars_only_line, - markers=('copyright', 'copyrights', 'copyrightby',) + copyrights_hint.all_years, -): - """ - Return True if a line ends with some strings that indicate we are still - inside a statement. - """ - return chars_only_line and chars_only_line.endswith(markers) - - def is_end_of_statement(chars_only_line): """ Return True if a line ends with some strings that indicate we are at the end