From f00dd6aabe8962c10390910d5bacf532e13db54f Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 18:08:31 +0000 Subject: [PATCH 01/13] keep target in Link so it can be copied to PageSource --- pylinkvalidator/crawler.py | 6 ++++-- pylinkvalidator/models.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py index 73a0666..531f0f8 100644 --- a/pylinkvalidator/crawler.py +++ b/pylinkvalidator/crawler.py @@ -525,6 +525,7 @@ def _get_links(self, elements, attribute, base_url_split, for element in elements: if attribute in element.attrs: url = element[attribute] + target = element.attrs.get('target', None) if not self.worker_config.strict_mode: url = url.strip() @@ -540,7 +541,7 @@ def _get_links(self, elements, attribute, base_url_split, link = Link( type=unicode(element.name), url_split=abs_url_split, original_url_split=original_url_split, - source_str=unicode(element)) + source_str=unicode(element), target=target) links.append(link) return links @@ -658,7 +659,8 @@ def process_links(self, page_crawl): continue page_status = self.page_statuses.get(url_split, None) - page_source = PageSource(source_url_split, link.source_str) + page_source = PageSource( + source_url_split, link.source_str, link.target) if not page_status: # We never encountered this url before diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 228ef11..3d7d171 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -133,7 +133,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]): Link = namedtuple_with_defaults( "Link", - ["type", "url_split", "original_url_split", "source_str"]) + ["type", "url_split", "original_url_split", "source_str", "target"]) PageCrawl = namedtuple_with_defaults( @@ -149,7 +149,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]): PageSource = namedtuple_with_defaults( - "PageSource", ["origin", 
"origin_str"]) + "PageSource", ["origin", "origin_str", "target"]) ContentCheck = namedtuple_with_defaults( From 05a56b7eaa1afa3c86636566f996d6d6b81a5656 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 18:10:02 +0000 Subject: [PATCH 02/13] JSON reporter (use prettified output) --- pylinkvalidator/reporter.py | 63 +++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 19f9715..f4325dc 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import, print_function import codecs +import json import re import smtplib import sys @@ -72,6 +73,68 @@ def _write_plain_text_report(site, config, output_files, total_time): _write_plain_text_report_single(site, config, output_files, total_time) +def _write_json_report(site, config, output_file, total_time): + start_urls = ",".join((start_url_split.geturl() for start_url_split in + site.start_url_splits)) + + total_urls = len(site.pages) + total_errors = len(site.error_pages) + + if not site.is_ok: + global_status = "ERROR" + error_summary = "with {0} error(s) ".format(total_errors) + else: + global_status = "SUCCESS" + error_summary = "" + + meta = { + "total_urls": total_urls, + "total_errors": total_errors, + "total_time": total_time, + "start_urls": start_urls, + "global_status": global_status, + "error_summary": error_summary + } + try: + avg_response_time = site.get_average_response_time() + avg_process_time = site.get_average_process_time() + meta.update({"avg_response_time": avg_response_time}) + meta.update({"avg_process_time": avg_process_time}) + except Exception: + from traceback import print_exc + print_exc() + + pages = {} + + if config.options.report_type == REPORT_TYPE_ERRORS: + pages = site.error_pages + elif config.options.report_type == REPORT_TYPE_ALL: + pages = site.pages + + res_pages = [] + + for results, resource in 
pages.items(): + details = { + 'fragment': results.fragment, + 'hostname': results.hostname, + 'netloc': results.netloc, + 'path': results.path, + 'port': results.port, + 'query': results.query, + 'scheme': results.scheme, + "sources": [source.origin_str for source in resource.sources], + "targets": [source.target for source in resource.sources] + } + res_pages.append(details) + + res = { + "meta": meta, + "pages": res_pages + } + output_file.write( + json.dumps(res, sort_keys=True, indent=4, separators=(',', ': '))) + + def _write_plain_text_report_multi(site, config, output_files, total_time): total_urls = len(site.pages) total_errors = len(site.error_pages) From ca0ba0c9d98eca9fd0599f08e8b92917968f16b0 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 18:11:20 +0000 Subject: [PATCH 03/13] enable JSON reporter --- pylinkvalidator/models.py | 2 +- pylinkvalidator/reporter.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 3d7d171..a75930f 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -582,7 +582,7 @@ def _build_parser(self): output_group.add_option( "-f", "--format", dest="format", action="store", - default=FORMAT_PLAIN, choices=[FORMAT_PLAIN], + default=FORMAT_PLAIN, choices=[FORMAT_PLAIN, FORMAT_JSON], help="Format of the report: plain") output_group.add_option( "-o", "--output", dest="output", action="store", diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index f4325dc..4ff6622 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -13,7 +13,7 @@ from pylinkvalidator.compat import StringIO from pylinkvalidator.models import ( - REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN) + REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_JSON, FORMAT_PLAIN) PLAIN_TEXT = "text/plain" @@ -55,6 +55,8 @@ def report(site, config, total_time, logger=None): try: if config.options.format == FORMAT_PLAIN: 
_write_plain_text_report(site, config, output_files, total_time) + if config.options.format == FORMAT_JSON: + _write_json_report(site, config, output_file, total_time) except Exception: if logger: logger.exception("An exception occurred while writing the report") From 142eaa61f8068ef79d659e329753a56e8892bffc Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 19:11:44 +0000 Subject: [PATCH 04/13] don't truncate origin source this is handy when analysing link sources with external tools like sed, awk --- pylinkvalidator/reporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 4ff6622..652f432 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -243,7 +243,7 @@ def _print_details(page_iterator, output_files, config, indent=2): source.origin.geturl(), initial_indent), files=output_files) if config.options.show_source: oprint("{1} {0}".format( - truncate(source.origin_str), initial_indent), + source.origin_str, initial_indent), files=output_files) From 0cf603542165107b8fca0bd90fb8cc6e5a30382c Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 19:13:40 +0000 Subject: [PATCH 05/13] display source target attribute value when using -S --- pylinkvalidator/reporter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 652f432..5706787 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -239,8 +239,8 @@ def _print_details(page_iterator, output_files, config, indent=2): oprint("{1} {0}".format(content_message, initial_indent), files=output_files) for source in page.sources: - oprint("{1} from {0}".format( - source.origin.geturl(), initial_indent), files=output_files) + oprint("{1} from {0} target={2}".format( + source.origin.geturl(), initial_indent, source.target), files=output_files) if config.options.show_source: oprint("{1} 
{0}".format( source.origin_str, initial_indent), From bec2e5bc91405a7654ab12c1327dba994dd52816 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Sun, 17 Dec 2017 19:20:21 +0000 Subject: [PATCH 06/13] include more details in JSON report --- pylinkvalidator/reporter.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 5706787..4cb9af3 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -117,15 +117,24 @@ def _write_json_report(site, config, output_file, total_time): for results, resource in pages.items(): details = { + 'link': resource.url_split.geturl(), 'fragment': results.fragment, 'hostname': results.hostname, 'netloc': results.netloc, + 'is_local': resource.is_local, + 'is_html': resource.is_html, + 'is_ok': resource.is_ok, + 'is_timeout': resource.is_timeout, + 'process_time': resource.process_time, + 'response_time': resource.response_time, + 'status': resource.status, 'path': results.path, 'port': results.port, 'query': results.query, 'scheme': results.scheme, - "sources": [source.origin_str for source in resource.sources], - "targets": [source.target for source in resource.sources] + 'origins': [source.origin.geturl() for source in resource.sources], + 'sources': [source.origin_str for source in resource.sources], + 'targets': [source.target for source in resource.sources] } res_pages.append(details) From 677adeee552cdb8a0762c3b76de63e74a5ba776a Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:25:25 +0100 Subject: [PATCH 07/13] ignore report files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index bd7d7c8..f92a2cf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ *.pyc +*.json +*.xml pylinkvalidator.egg-info/ dist/ build/ +.idea/ From 44c822cc8cd97270ae71ee752e319845fa5c5194 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:26:40 +0100 Subject: [PATCH 08/13] 
add junit-xml which is used for generating JUNIT reports --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bd3ba7e..3024e4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -beautifulsoup4>=4.2.0 \ No newline at end of file +beautifulsoup4>=4.2.0 +junit-xml>=1.8 From 2f36be1b0247f5089ea64291b6f7b45f4da99799 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:27:58 +0100 Subject: [PATCH 09/13] add a helper to print the summary just to the console --- pylinkvalidator/reporter.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 4cb9af3..66458a4 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -237,6 +237,42 @@ def _write_plain_text_report_single(site, config, output_files, total_time): _print_details(pages.values(), output_files, config) +def print_summary(site, config, total_time, indent=2): + total_urls = len(site.pages) + total_errors = len(site.error_pages) + + if not site.is_ok: + global_status = "ERROR" + error_summary = "with {0} error(s) ".format(total_errors) + else: + global_status = "SUCCESS" + error_summary = "" + + print("{0} Crawled {1} urls {2}in {3:.2f} seconds".format( + global_status, total_urls, error_summary, total_time)) + + pages = {} + + if config.options.report_type == REPORT_TYPE_ERRORS: + pages = site.error_pages + elif config.options.report_type == REPORT_TYPE_ALL: + pages = site.pages + + initial_indent = " " * indent + for page in pages.values(): + print("\n{2}{0}: {1}".format( + page.get_status_message(), page.url_split.geturl(), + initial_indent)) + for content_message in page.get_content_messages(): + print("{1} {0}".format(content_message, initial_indent)) + for source in page.sources: + print("{1} from {0} target={2}".format( + source.origin.geturl(), initial_indent, source.target)) + if 
config.options.show_source: + print("{1} {0}".format( + source.origin_str, initial_indent)) + + def _print_details(page_iterator, output_files, config, indent=2): initial_indent = " " * indent for page in page_iterator: From df1d3d8062b1cb1dd1d3a4114d6271c19eb5314c Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:28:51 +0100 Subject: [PATCH 10/13] print summary after generating JSON report --- pylinkvalidator/reporter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 66458a4..cbbecc2 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -144,6 +144,7 @@ def _write_json_report(site, config, output_file, total_time): } output_file.write( json.dumps(res, sort_keys=True, indent=4, separators=(',', ': '))) + print_summary(site, config, total_time) def _write_plain_text_report_multi(site, config, output_files, total_time): From 39e7528545cb859ff8ebf32572da0a17a396a8b5 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:29:19 +0100 Subject: [PATCH 11/13] add JUNIT report writer --- pylinkvalidator/reporter.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index cbbecc2..4597387 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -11,6 +11,8 @@ from email.mime.text import MIMEText +from junit_xml import TestSuite, TestCase + from pylinkvalidator.compat import StringIO from pylinkvalidator.models import ( REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_JSON, FORMAT_PLAIN) @@ -75,6 +77,41 @@ def _write_plain_text_report(site, config, output_files, total_time): _write_plain_text_report_single(site, config, output_files, total_time) +def _write_junit_report(site, config, output_file, total_time): + pages = site.pages + test_cases = [] + + for results, resource in pages.items(): + origins = [source.origin.geturl() for source in 
resource.sources] + if resource.status == 200: + test_case = TestCase( + name=resource.url_split.geturl(), + classname=results.hostname, + elapsed_sec=resource.response_time, + stdout=resource.status, + status="passed" + ) + else: + stderr_message = "Link found on:\n{}".format("\n".join(origins)) + test_case = TestCase( + name=resource.url_split.geturl(), + classname=results.hostname, + elapsed_sec=resource.response_time, + stderr=stderr_message, + status="failed" + ) + if resource.exception: + message = str(resource.exception) + else: + message = "Expected 200 OK but got {}".format(resource.status) + test_case.add_failure_info( + message=message, failure_type="UnexpectedStatusCode") + test_cases.append(test_case) + test_suite = TestSuite("pylinkvalidator test suite", test_cases) + output_file.write(TestSuite.to_xml_string([test_suite])) + print_summary(site, config, total_time) + + def _write_json_report(site, config, output_file, total_time): start_urls = ",".join((start_url_split.geturl() for start_url_split in site.start_url_splits)) From f236edd0273a0fec2cea6024509b4f0aa6ceaac2 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:30:14 +0100 Subject: [PATCH 12/13] enable option to generate a JUNIT report --- pylinkvalidator/models.py | 4 +++- pylinkvalidator/reporter.py | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index a75930f..31cb262 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -81,6 +81,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]): FORMAT_PLAIN = "plain" FORMAT_HTML = "html" FORMAT_JSON = "json" +FORMAT_JUNIT = "junit" WHEN_ALWAYS = "always" @@ -582,7 +583,8 @@ def _build_parser(self): output_group.add_option( "-f", "--format", dest="format", action="store", - default=FORMAT_PLAIN, choices=[FORMAT_PLAIN, FORMAT_JSON], + default=FORMAT_PLAIN, + choices=[FORMAT_PLAIN, FORMAT_JSON, FORMAT_JUNIT], 
help="Format of the report: plain") output_group.add_option( "-o", "--output", dest="output", action="store", diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py index 4597387..9878c68 100644 --- a/pylinkvalidator/reporter.py +++ b/pylinkvalidator/reporter.py @@ -15,7 +15,12 @@ from pylinkvalidator.compat import StringIO from pylinkvalidator.models import ( - REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_JSON, FORMAT_PLAIN) + FORMAT_JSON, + FORMAT_JUNIT, + FORMAT_PLAIN, + REPORT_TYPE_ALL, + REPORT_TYPE_ERRORS, +) PLAIN_TEXT = "text/plain" @@ -59,6 +64,8 @@ def report(site, config, total_time, logger=None): _write_plain_text_report(site, config, output_files, total_time) if config.options.format == FORMAT_JSON: _write_json_report(site, config, output_file, total_time) + if config.options.format == FORMAT_JUNIT: + _write_junit_report(site, config, output_file, total_time) except Exception: if logger: logger.exception("An exception occurred while writing the report") From 4369a6f876dee4582a0128f23fd9787b6350ca57 Mon Sep 17 00:00:00 2001 From: kowalcj0 Date: Fri, 25 May 2018 23:32:08 +0100 Subject: [PATCH 13/13] update help & readme with new formatting options --- README.rst | 2 +- pylinkvalidator/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index c7a9730..569bce2 100644 --- a/README.rst +++ b/README.rst @@ -171,7 +171,7 @@ usage examples. These options change the output of the crawler. -f FORMAT, --format=FORMAT - Format of the report: plain (default) + Format of the report: plain (default), json, junit -o OUTPUT, --output=OUTPUT Path of the file where the report will be printed. 
-W WHEN, --when=WHEN diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 31cb262..e10fe03 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -585,7 +585,7 @@ def _build_parser(self): "-f", "--format", dest="format", action="store", default=FORMAT_PLAIN, choices=[FORMAT_PLAIN, FORMAT_JSON, FORMAT_JUNIT], - help="Format of the report: plain") + help="Format of the report: plain (default), json, junit") output_group.add_option( "-o", "--output", dest="output", action="store", default=None,