From 8838f8c59fe3a11bf81a491aa86b51b81d117eac Mon Sep 17 00:00:00 2001 From: Jim Priest Date: Tue, 1 Sep 2015 14:44:03 -0400 Subject: [PATCH 1/2] Add excluded_urls option URLs matching the regular expression will be ignored --- README.rst | 3 +++ pylinkvalidator/models.py | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 12965f8..5d1b776 100644 --- a/README.rst +++ b/README.rst @@ -102,6 +102,9 @@ usage examples. -H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS Comma-separated list of additional hosts to crawl (e.g., example.com,subdomain.another.com) + -x EXCLUDED_URLS, --exclude=EXCLUDED_URLS + URLs matching the regular expression will be ignored + (e.g., /private/ ) -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES Comma-separated list of host/path prefixes to ignore (e.g., www.example.com/ignore_this_and_after/) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 54e33a9..3210f39 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -12,7 +12,7 @@ from optparse import OptionParser, OptionGroup from pylinkvalidator.compat import get_safe_str -from pylinkvalidator.urlutil import get_clean_url_split +from pylinkvalidator.urlutil import get_clean_url_split, re DEFAULT_TYPES = ['a', 'img', 'script', 'link'] @@ -148,6 +148,7 @@ def __init__(self): self.worker_config = None self.accepted_hosts = [] self.ignored_prefixes = [] + self.excluded_urls = [] self.worker_size = 0 def should_crawl(self, url_split, depth): @@ -160,8 +161,10 @@ def is_local(self, url_split): return url_split.netloc in self.accepted_hosts def should_download(self, url_split): - """Returns True if the url does not start with an ignored prefix and if - it is local or outside links are allowed.""" + """Returns True if the url does not start with + * an ignored prefix + * it does not match excluded url regex + * if it is local or outside links are allowed.""" local = self.is_local(url_split) 
if not self.options.test_outside and not local: @@ -169,6 +172,10 @@ def should_download(self, url_split): url = url_split.geturl() + for exclude_url in self.excluded_urls: + if re.search(exclude_url, url): + return False + for ignored_prefix in self.ignored_prefixes: if url.startswith(ignored_prefix): return False @@ -207,6 +214,9 @@ def _parse_config(self): if self.options.ignored_prefixes: self.ignored_prefixes = self.options.ignored_prefixes.split(',') + if self.options.excluded_urls: + self.excluded_urls = self.options.excluded_urls.split(',') + if self.options.workers: self.worker_size = self.options.workers else: @@ -274,6 +284,11 @@ def _build_parser(self): dest="accepted_hosts", action="store", default=None, help="comma-separated list of additional hosts to crawl (e.g., " "example.com,subdomain.another.com)") + crawler_group.add_option( + "-x", "--exclude", dest="excluded_urls", + action="store", default=None, + help="URLs matching the regular expression will be ignored " + "(e.g., /private/)") crawler_group.add_option( "-i", "--ignore", dest="ignored_prefixes", action="store", default=None, From 86a86795d701dca7a8515c4a9013e17ec9b62885 Mon Sep 17 00:00:00 2001 From: Jim Priest Date: Wed, 2 Sep 2015 09:23:42 -0400 Subject: [PATCH 2/2] Add excluded urls option * refactor from pull request comments to compile regex * add exclude test --- pylinkvalidator/models.py | 5 +++-- pylinkvalidator/tests.py | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 3210f39..db32dd5 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -173,7 +173,7 @@ def should_download(self, url_split): url = url_split.geturl() for exclude_url in self.excluded_urls: - if re.search(exclude_url, url): + if exclude_url.search(url): return False for ignored_prefix in self.ignored_prefixes: @@ -215,7 +215,8 @@ def _parse_config(self): self.ignored_prefixes = 
self.options.ignored_prefixes.split(',') if self.options.excluded_urls: - self.excluded_urls = self.options.excluded_urls.split(',') + self.excluded_urls = [re.compile(pattern) for pattern in self.options.excluded_urls.split(',')] + if self.options.workers: self.worker_size = self.options.workers diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py index 945d3f0..6435596 100644 --- a/pylinkvalidator/tests.py +++ b/pylinkvalidator/tests.py @@ -331,6 +331,13 @@ def test_run_once(self): self.assertEqual(8, len(site.pages)) self.assertEqual(0, len(site.error_pages)) + def test_exclude(self): + site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"]) + + # exclude /sub/ directory = 4 pages linked on the index + self.assertEqual(4, len(site.pages)) + self.assertEqual(0, len(site.error_pages)) + def test_depth_0(self): site = self._run_crawler_plain( ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")