From 8838f8c59fe3a11bf81a491aa86b51b81d117eac Mon Sep 17 00:00:00 2001 From: Jim Priest Date: Tue, 1 Sep 2015 14:44:03 -0400 Subject: [PATCH 1/2] Add excluded_urls option URLs matching the regular expression will be ignored --- README.rst | 3 +++ pylinkvalidator/models.py | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 12965f8..5d1b776 100644 --- a/README.rst +++ b/README.rst @@ -102,6 +102,9 @@ usage examples. -H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS Comma-separated list of additional hosts to crawl (e.g., example.com,subdomain.another.com) + -x EXCLUDED_URLS, --exclude=EXCLUDED_URLS + URLs matching the regular expression will be ignored + (e.g., /private/ ) -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES Comma-separated list of host/path prefixes to ignore (e.g., www.example.com/ignore_this_and_after/) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 54e33a9..3210f39 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -12,7 +12,7 @@ from optparse import OptionParser, OptionGroup from pylinkvalidator.compat import get_safe_str -from pylinkvalidator.urlutil import get_clean_url_split +from pylinkvalidator.urlutil import get_clean_url_split, re DEFAULT_TYPES = ['a', 'img', 'script', 'link'] @@ -148,6 +148,7 @@ def __init__(self): self.worker_config = None self.accepted_hosts = [] self.ignored_prefixes = [] + self.excluded_urls = [] self.worker_size = 0 def should_crawl(self, url_split, depth): @@ -160,8 +161,10 @@ def is_local(self, url_split): return url_split.netloc in self.accepted_hosts def should_download(self, url_split): - """Returns True if the url does not start with an ignored prefix and if - it is local or outside links are allowed.""" + """Returns True if the url does not start with + * an ignored prefix + * it does not match excluded url regex + * if it is local or outside links are allowed.""" local = self.is_local(url_split) 
if not self.options.test_outside and not local: @@ -169,6 +172,10 @@ def should_download(self, url_split): url = url_split.geturl() + for exclude_url in self.excluded_urls: + if re.search(exclude_url, url): + return False + for ignored_prefix in self.ignored_prefixes: if url.startswith(ignored_prefix): return False @@ -207,6 +214,9 @@ def _parse_config(self): if self.options.ignored_prefixes: self.ignored_prefixes = self.options.ignored_prefixes.split(',') + if self.options.excluded_urls: + self.excluded_urls = self.options.excluded_urls.split(',') + if self.options.workers: self.worker_size = self.options.workers else: @@ -274,6 +284,11 @@ def _build_parser(self): dest="accepted_hosts", action="store", default=None, help="comma-separated list of additional hosts to crawl (e.g., " "example.com,subdomain.another.com)") + crawler_group.add_option( + "-x", "--exclude", dest="excluded_urls", + action="store", default=None, + help="URLs matching the regular expression will be ignored " + "(e.g., /private/)") crawler_group.add_option( "-i", "--ignore", dest="ignored_prefixes", action="store", default=None, From 86a86795d701dca7a8515c4a9013e17ec9b62885 Mon Sep 17 00:00:00 2001 From: Jim Priest Date: Wed, 2 Sep 2015 09:23:42 -0400 Subject: [PATCH 2/2] Add excluded urls option * refactor from pull request comments to compile regex * add exclude test --- pylinkvalidator/models.py | 5 +++-- pylinkvalidator/tests.py | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 3210f39..db32dd5 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -173,7 +173,7 @@ def should_download(self, url_split): url = url_split.geturl() for exclude_url in self.excluded_urls: - if re.search(exclude_url, url): + if exclude_url.search(url): return False for ignored_prefix in self.ignored_prefixes: @@ -215,7 +215,8 @@ def _parse_config(self): self.ignored_prefixes = 
self.options.ignored_prefixes.split(',') if self.options.excluded_urls: - self.excluded_urls = self.options.excluded_urls.split(',') + self.excluded_urls = [re.compile(pattern) for pattern in self.options.excluded_urls.split(',')] + if self.options.workers: self.worker_size = self.options.workers diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py index 945d3f0..6435596 100644 --- a/pylinkvalidator/tests.py +++ b/pylinkvalidator/tests.py @@ -331,6 +331,13 @@ def test_run_once(self): self.assertEqual(8, len(site.pages)) self.assertEqual(0, len(site.error_pages)) + def test_exclude(self): + site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"]) + + # exclude /sub/ directory = 4 pages linked on the index + self.assertEqual(4, len(site.pages)) + self.assertEqual(0, len(site.error_pages)) + def test_depth_0(self): site = self._run_crawler_plain( ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")