diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py
index ddd776f..efb49a5 100644
--- a/pylinkvalidator/crawler.py
+++ b/pylinkvalidator/crawler.py
@@ -240,6 +240,7 @@ def __init__(self, worker_init):
         self.urlopen = get_url_open()
         self.request_class = get_url_request()
         self.logger = worker_init.logger
+
         if not self.logger:
            # Get a new one!
            self.logger = get_logger()
@@ -247,9 +248,9 @@ def __init__(self, worker_init):
         # We do this here to allow patching by gevent
         import socket
         self.timeout_exception = socket.timeout
-
         self.auth_header = None
+
         if self.worker_config.username and self.worker_config.password:
             base64string = unicode(
                 base64.encodestring(
@@ -280,6 +281,9 @@ def _crawl_page(self, worker_input):
         url_split_to_crawl = worker_input.url_split

         try:
+            if self.worker_config.wait > 0:
+                time.sleep(self.worker_config.wait)
+
             response = open_url(
                 self.urlopen, self.request_class,
                 url_split_to_crawl.geturl(), self.worker_config.timeout,
@@ -375,6 +379,7 @@ def _crawl_page(self, worker_input):
                 site_origin=worker_input.site_origin,
                 missing_content=missing_content,
                 erroneous_content=erroneous_content)
+
         except Exception as exc:
             exception = ExceptionStr(unicode(type(exc)), unicode(exc))
             page_crawl = PageCrawl(
diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py
index bb54b50..96ec6e3 100644
--- a/pylinkvalidator/models.py
+++ b/pylinkvalidator/models.py
@@ -43,7 +43,6 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):

 DEFAULT_TYPES = ['a', 'img', 'script', 'link']

-
 TYPE_ATTRIBUTES = {
     'a': 'href',
     'img': 'src',
@@ -51,22 +50,18 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
     'link': 'href',
 }

-
 DEFAULT_TIMEOUT = 10

-
 MODE_THREAD = "thread"
 MODE_PROCESS = "process"
 MODE_GREEN = "green"

-
 DEFAULT_WORKERS = {
     MODE_THREAD: 1,
     MODE_PROCESS: 1,
     MODE_GREEN: 1000,
 }

-
 PARSER_STDLIB = "html.parser"
 PARSER_LXML = "lxml"
 PARSER_HTML5 = "html5lib"
@@ -80,23 +75,20 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
 FORMAT_HTML = "html"
 FORMAT_JSON = "json"

-
 WHEN_ALWAYS = "always"
 WHEN_ON_ERROR = "error"

-
 REPORT_TYPE_ERRORS = "errors"
 REPORT_TYPE_SUMMARY = "summary"
 REPORT_TYPE_ALL = "all"

-
 VERBOSE_QUIET = "0"
 VERBOSE_NORMAL = "1"
 VERBOSE_INFO = "2"

-
 HTML_MIME_TYPE = "text/html"

+DEFAULT_WAIT = 0

 PAGE_QUEUED = '__PAGE_QUEUED__'
 PAGE_CRAWLED = '__PAGE_CRAWLED__'
@@ -108,32 +100,26 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
     "WorkerInit",
     ["worker_config", "input_queue", "output_queue", "logger"])

-
 WorkerConfig = namedtuple_with_defaults(
     "WorkerConfig",
     ["username", "password", "types", "timeout", "parser", "strict_mode",
-        "prefer_server_encoding", "extra_headers"])
-
+        "prefer_server_encoding", "extra_headers", "wait"])

 WorkerInput = namedtuple_with_defaults(
     "WorkerInput",
     ["url_split", "should_crawl", "depth", "site_origin", "content_check"])

-
 Response = namedtuple_with_defaults(
     "Response",
     ["content", "status", "exception", "original_url", "final_url",
         "is_redirect", "is_timeout", "response_time"])

-
 ExceptionStr = namedtuple_with_defaults(
     "ExceptionStr", ["type_name", "message"])

-
 Link = namedtuple_with_defaults(
     "Link",
     ["type", "url_split", "original_url_split", "source_str"])

-
 PageCrawl = namedtuple_with_defaults(
     "PageCrawl", ["original_url_split", "final_url_split",
                   "status", "is_timeout", "is_redirect", "links",
@@ -141,15 +127,12 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
                   "process_time", "site_origin",
                   "missing_content", "erroneous_content"])

-
 PageStatus = namedtuple_with_defaults(
     "PageStatus", ["status", "sources"])

-
 PageSource = namedtuple_with_defaults(
     "PageSource", ["origin", "origin_str"])

-
 ContentCheck = namedtuple_with_defaults(
     "ContentCheck",
     ["html_presence", "html_absence", "text_presence", "text_absence",
@@ -162,6 +145,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
 class UTF8Class(object):
     """Handles unicode string from __unicode__() in: __str__() and
     __repr__()
     """
+
     def __str__(self):
         return get_safe_str(self.__unicode__())
@@ -202,8 +186,8 @@ def __init__(self):

     def should_crawl(self, url_split, depth):
         """Returns True if url split is local AND depth is acceptable"""
-        return (self.options.depth < 0 or depth < self.options.depth) and\
-            self.is_local(url_split)
+        return (self.options.depth < 0 or depth < self.options.depth) and \
+            self.is_local(url_split)

     def is_local(self, url_split, site_origin=None):
         """Returns true if url split is in the accepted hosts. site_origin must
@@ -307,7 +291,7 @@ def _build_worker_config(self, options):
         return WorkerConfig(
             options.username, options.password, types, options.timeout,
             options.parser, options.strict_mode,
-            options.prefer_server_encoding, headers)
+            options.prefer_server_encoding, headers, options.wait)

     def _build_accepted_hosts(self, options, start_urls):
         if options.multi:
@@ -427,7 +411,7 @@ def _get_prefix_content(self, content, prefix=None):
         if not prefix:
             index = content.find(",")
             prefix = get_clean_url_split(content[:index])
-            content = content[index+1:]
+            content = content[index + 1:]

         return (prefix, content)

@@ -454,14 +438,14 @@ def _build_parser(self):
             help="fetch resources from other domains without crawling them")
         crawler_group.add_option(
             "-H", "--accepted-hosts",
-            dest="accepted_hosts", action="store", default=None,
+            dest="accepted_hosts", action="store", default=None,
             help="comma-separated list of additional hosts to crawl (e.g., "
-                 "example.com,subdomain.another.com)")
+            "example.com,subdomain.another.com)")
         crawler_group.add_option(
             "-i", "--ignore",
             dest="ignored_prefixes", action="store", default=None,
             help="comma-separated list of host/path prefixes to ignore "
-                 "(e.g., www.example.com/ignore_this_and_after/)")
+            "(e.g., www.example.com/ignore_this_and_after/)")
         crawler_group.add_option(
             "-u", "--username",
             dest="username", action="store", default=None,
@@ -476,9 +460,9 @@ def _build_parser(self):
             help="each argument is considered to be a different site")
         crawler_group.add_option(
             "-D", "--header",
-            dest="headers", action="append", metavar="HEADER",
+            dest="headers", action="append", metavar="HEADER",
             help="custom header of the form Header: Value "
-                 "(repeat for multiple headers)")
+            "(repeat for multiple headers)")
         crawler_group.add_option(
             "--url-file-path",
             dest="url_file_path", action="store", default=None,
@@ -489,7 +473,7 @@ def _build_parser(self):
             "-t", "--types",
             dest="types", action="store", default=",".join(DEFAULT_TYPES),
             help="Comma-separated values of tags to look for when crawling"
-                 "a site. Default (and supported types): a,img,link,script")
+            "a site. Default (and supported types): a,img,link,script")
         crawler_group.add_option(
             "-T", "--timeout", dest="timeout", type="int", action="store",
             default=DEFAULT_TIMEOUT,
@@ -518,30 +502,30 @@ def _build_parser(self):
             "--check-presence",
             dest="content_presence", action="append",
             help="Check presence of raw or HTML content on all pages. e.g., "
-                 "regex:content. "
-                 "Content can be either regex:pattern or plain content")
+            "regex:content. "
+            "Content can be either regex:pattern or plain content")
         crawler_group.add_option(
             "--check-absence",
             dest="content_absence", action="append",
             help="Check absence of raw or HTML content on all pages. e.g., "
-                 "regex:content. "
-                 "Content can be either regex:pattern or plain content")
+            "regex:content. "
+            "Content can be either regex:pattern or plain content")
         crawler_group.add_option(
             "--check-presence-once",
             dest="content_presence_once", action="append",
             help="Check presence of raw or HTML content for one page: "
-                 "path,content, e.g.,: "
-                 "/path,regex:content. "
-                 "Content can be either regex:pattern or plain content. "
-                 "Path can be either relative or absolute with domain.")
+            "path,content, e.g.,: "
+            "/path,regex:content. "
+            "Content can be either regex:pattern or plain content. "
+            "Path can be either relative or absolute with domain.")
         crawler_group.add_option(
             "--check-absence-once",
             dest="content_absence_once", action="append",
             help="Check absence of raw or HTML content for one page: "
-                 "path,content, e.g.,"
-                 "path,regex:content. "
-                 "Content can be either regex:pattern or plain content. "
-                 "Path can be either relative or absolute with domain.")
+            "path,content, e.g.,"
+            "path,regex:content. "
+            "Content can be either regex:pattern or plain content. "
+            "Path can be either relative or absolute with domain.")

         # TODO Add follow redirect option.

@@ -565,6 +549,10 @@ def _build_parser(self):
             help="Types of HTML parse: html.parser (default), lxml, html5lib",
             default=PARSER_STDLIB,
             choices=[PARSER_STDLIB, PARSER_LXML, PARSER_HTML5])
+        perf_group.add_option(
+            "--wait", dest="wait", type="int", action="store", default=DEFAULT_WAIT,
+            help="Number of seconds to wait between each worker request: 0 (default). "
+            "Combine with --workers to control concurrency.")

         parser.add_option_group(perf_group)

@@ -584,18 +572,18 @@ def _build_parser(self):
             "-W", "--when", dest="when", action="store", default=WHEN_ALWAYS,
             choices=[WHEN_ALWAYS, WHEN_ON_ERROR],
             help="When to print the report. error (only if a "
-                 "crawling error occurs) or always (default)")
+            "crawling error occurs) or always (default)")
         output_group.add_option(
             "-E", "--report-type", dest="report_type",
             help="Type of report to print: errors (default, summary and "
-                 "erroneous links), summary, all (summary and all links)",
+            "erroneous links), summary, all (summary and all links)",
             action="store", default=REPORT_TYPE_ERRORS,
             choices=[REPORT_TYPE_ERRORS, REPORT_TYPE_SUMMARY, REPORT_TYPE_ALL])
         output_group.add_option(
             "-c", "--console",
             dest="console", action="store_true", default=False,
             help="Prints report to the console in addition to other output"
-                 " options such as file or email.")
+            " options such as file or email.")
         crawler_group.add_option(
             "-S", "--show-source",
             dest="show_source", action="store_true", default=False,
@@ -611,12 +599,12 @@ def _build_parser(self):
             "-a", "--address",
             dest="address", action="store", default=None,
             help="Comma-separated list of email addresses used to send a "
-                 "report")
+            "report")
         email_group.add_option(
             "--from",
             dest="from_address", action="store", default=None,
             help="Email address to use in the from field of the email "
-                 "(optional)")
+            "(optional)")
         email_group.add_option(
             "-s", "--smtp",
             dest="smtp", action="store", default=None,
@@ -673,8 +661,8 @@ def __init__(self, url_split, status=200, is_timeout=False, exception=None,
         self.exception = exception
         self.is_html = is_html
         self.is_local = is_local
-        self.is_ok = status and status < 400 and not missing_content and\
-            not erroneous_content
+        self.is_ok = status and status < 400 and not missing_content and \
+            not erroneous_content
         self.response_time = response_time
         self.process_time = process_time
         self.site_origin = site_origin
@@ -723,10 +711,10 @@ def get_content_messages(self):
         """Gets missing and erroneous content
         """
         messages = [
-            "missing content: {0}".format(content) for content in
-            self.missing_content] + [
-            "erroneous content: {0}".format(content) for content in
-            self.erroneous_content]
+            "missing content: {0}".format(content) for content in
+            self.missing_content] + [
+            "erroneous content: {0}".format(content) for content in
+            self.erroneous_content]

         return messages

diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py
index 8630a11..d66c13f 100644
--- a/pylinkvalidator/tests.py
+++ b/pylinkvalidator/tests.py
@@ -432,6 +432,15 @@ def test_depth_0(self):
         self.assertEqual(7, len(site.pages))
         self.assertEqual(1, len(site.error_pages))

+    def test_wait_1(self):
+        start_crawl = time.time()
+        site = self._run_crawler_plain(
+            ThreadSiteCrawler, ["--wait", "1"], "/depth/root.html")
+        end_crawl = time.time()
+        crawl_time = end_crawl - start_crawl
+        # With a 1 second wait, crawl time should be >= the number of pages crawled.
+        self.assertTrue(crawl_time >= len(site.pages))
+
     def test_strict_mode(self):
         site = self._run_crawler_plain(ThreadSiteCrawler, ["--strict"])