diff --git a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala index 34ebf44a7..22c9ed010 100644 --- a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala +++ b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala @@ -221,6 +221,29 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { } } else { + /* + * Sometimes guessContentTypeFromStream() fails because there is junk before the start of the HTML. + * + * This code looks at the first 1k characters to see if there is something that looks like HTML lurking beyond some + * initial junk, before giving up and saying it is not HTML. + */ + + val excerpt = htmlResult.substring(0, Math.min(htmlResult.length(), 1024)).toLowerCase() + + val idx1 = excerpt.indexOf("