Skip to content
This repository was archived by the owner on Oct 30, 2018. It is now read-only.
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions src/main/scala/com/gravity/goose/network/HtmlFetcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,29 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging {
}
}
else {
/*
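* Sometimes guessContentTypeFromStream() fails because there is junk before the start of the HTML.
*
* Before giving up and reporting that the content is not HTML, this code searches the first 1024 characters
* (case-insensitively) for "<!doctype html" or "<html". If either is found, the content from that point on is
* returned, discarding the leading junk. A bare "<html" match is accepted only if "<body" (matched
* case-sensitively) also appears somewhere in the content that follows it.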
*/

val excerpt = htmlResult.substring(0, Math.min(htmlResult.length(), 1024)).toLowerCase()

val idx1 = excerpt.indexOf("<!doctype html")
if (idx1 != -1) {
return Some(htmlResult.substring(idx1))
}

val idx2 = excerpt.indexOf("<html")
if (idx2 != -1) {
val temp = htmlResult.substring(idx2)

if (temp.contains("<body")) {
return Some(temp)
}
}

throw new NotHtmlException(cleanUrl)
}
}
Expand Down