From 9d55b87a47661158f52fcf1d2833e9cc276cad5f Mon Sep 17 00:00:00 2001 From: "Gregor N. Purdy, Sr" Date: Sun, 2 Aug 2015 09:13:52 -0700 Subject: [PATCH 1/2] Treat content as HTML even if it has junk before the start of the HTML --- .../com/gravity/goose/network/HtmlFetcher.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala index 34ebf44a7..80f29d912 100644 --- a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala +++ b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala @@ -221,6 +221,23 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { } } else { + /* + * Sometimes guessContentTypeFromStream() fails because there is junk before the start of the HTML. + * + * This code looks at the first 1k characters to see if there is something that looks like HTML lurking beyond some + * initial junk, before giving up and saying it is not HTML. + */ + + val excerpt = htmlResult.substring(0, 1024).toLowerCase() + + if (excerpt.contains(" Date: Sun, 2 Aug 2015 09:28:24 -0700 Subject: [PATCH 2/2] Make substring() call safe if the content is smaller than 1k, and make the implementation more efficient. --- .../com/gravity/goose/network/HtmlFetcher.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala index 80f29d912..22c9ed010 100644 --- a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala +++ b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala @@ -228,14 +228,20 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { * initial junk, before giving up and saying it is not HTML. */ - val excerpt = htmlResult.substring(0, 1024).toLowerCase() + val excerpt = htmlResult.substring(0, Math.min(htmlResult.length(), 1024)).toLowerCase() - if (excerpt.contains("