From 834723beda7859252a1eca2a8eaaa2a54dcd072c Mon Sep 17 00:00:00 2001 From: rastasheep Date: Sun, 25 Oct 2015 20:47:38 +0100 Subject: [PATCH] Fallback to 'http' as default url schema if needed --- goose/__init__.py | 6 ++++++ goose/configuration.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..5565f978 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -22,6 +22,7 @@ """ import os import platform +import urllib2 from tempfile import mkstemp from goose.version import version_info, __version__ @@ -52,6 +53,11 @@ def extract(self, url=None, raw_html=None): Main method to extract an article object from a URL, pass in a url and get back a Article """ + scheme, address = urllib2.splittype(url) + + if not scheme: + url = self.config.default_scheme + url + cc = CrawlCandidate(self.config, url, raw_html) return self.crawl(cc) diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..827a3476 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -99,6 +99,10 @@ def __init__(self): # http timeout self.http_timeout = HTTP_DEFAULT_TIMEOUT + # default url scheme + # it will be used as a fallback if the url doesn't contain one + self.default_scheme = 'http://' + def get_parser(self): return AVAILABLE_PARSERS[self.parser_class]