diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
index 6d17467acf..ab591dee8d 100644
--- a/src/cluecode/copyrights.py
+++ b/src/cluecode/copyrights.py
@@ -4293,12 +4293,27 @@ def is_candidate(prepared_line):
 
     if copyrights_hint.years(prepared_line):
         return True
-    else:
-        pass
-    prepared_line = prepared_line.lower()
+
+    lowered = prepared_line.lower()
+
+    # "(c)" on a line with URL-like text and no other statement marker is
+    # most likely part of a URL rather than a copyright sign: skip the line.
+    # NOTE: line-level heuristic; it does not check that "(c)" is in the URL.
+    if '(c)' in lowered and 'http' in lowered:
+        has_other_marker = any(
+            marker != '(c)' and marker in lowered
+            for marker in copyrights_hint.statement_markers
+        )
+        if not has_other_marker:
+            if TRACE:
+                logger_debug(f'is_candidate: (c) only in URL-like text:\n{prepared_line!r}')
+            return False
+
     for marker in copyrights_hint.statement_markers:
-        if marker in prepared_line:
+        if marker in lowered:
             return True
+
+    return False
 
 
 def is_inside_statement(
diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py
index 1fbafcf548..7f30031788 100644
--- a/tests/cluecode/test_copyrights_basic.py
+++ b/tests/cluecode/test_copyrights_basic.py
@@ -14,7 +14,27 @@ from commoncode.testcase import FileBasedTesting
 
 from cluecode import copyrights
 from cluecode.copyrights import prepare_text_line
 from cluecode.copyrights import remove_non_chars
+
+
+def test_copyright_symbol_inside_url_is_ignored():
+    text = "See http://example.com/(c)/path for more information."
+
+    prepped = prepare_text_line(text)
+
+    # sanity check
+    assert '(c)' in prepped
+
+    # URLs containing (c) should NOT be copyright candidates
+    assert not copyrights.is_candidate(prepped)
+
+
+def test_copyright_with_url_is_still_candidate():
+    text = "Copyright (c) 2000 Foo, http://example.com"
+
+    prepped = prepare_text_line(text)
+
+    assert copyrights.is_candidate(prepped)
 
 
 class TestTextPreparation(FileBasedTesting):