From b48e942a7b0a3e2828994af2d5a67f3dace0fc6d Mon Sep 17 00:00:00 2001
From: dushanlk
Date: Fri, 3 Jan 2025 12:04:12 +0530
Subject: [PATCH] Refactored internal/external URL checker with TLD comparison

---
 services/htmlparser.go | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/services/htmlparser.go b/services/htmlparser.go
index 5e9c011..fa3efa9 100644
--- a/services/htmlparser.go
+++ b/services/htmlparser.go
@@ -6,8 +6,10 @@ import (
 	"net/url"
 	"scraper/logger"
 	"scraper/models"
+	"strings"
 
 	"golang.org/x/net/html"
+	"golang.org/x/net/publicsuffix"
 )
 
 func FetchPageInfo(client *http.Client, baseURL string) (*models.PageInfo, error) {
@@ -86,11 +88,49 @@ func resolveURL(baseURL, href string) string {
 	return base.ResolveReference(rel).String()
 }
 
-func isInternal(baseURL, fullURL string) bool {
-	//TODO: Host name comparison might not work. Fix here!
-	base, _ := url.Parse(baseURL)
-	full, _ := url.Parse(fullURL)
-	return base.Host == full.Host
+// isInternal reports whether scrapedURL is on the same site as baseURL.
+//
+// Two URLs count as the same site when their hosts share a registrable domain
+// (eTLD+1 according to the Public Suffix List), so subdomains of one site match
+// each other. It returns false when either URL fails to parse or has no host,
+// as is the case for "mailto:" or "javascript:" links.
+func isInternal(baseURL, scrapedURL string) bool {
+	baseSite, ok := siteKey(baseURL)
+	if !ok {
+		return false
+	}
+	scrapedSite, ok := siteKey(scrapedURL)
+	if !ok {
+		return false
+	}
+	return baseSite == scrapedSite
+}
+
+// siteKey returns the string used to compare the sites of two URLs: the eTLD+1
+// of rawURL's host, or the host itself when no eTLD+1 can be derived (for
+// example "localhost"). The second result is false when rawURL does not parse
+// or carries no host.
+func siteKey(rawURL string) (string, bool) {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return "", false
+	}
+
+	// Hostname drops any ":port" so it is not fed into the suffix lookup. The
+	// host is lower-cased and a trailing root dot removed because the lookup is
+	// not documented to normalise either.
+	host := strings.TrimSuffix(strings.ToLower(parsed.Hostname()), ".")
+	if host == "" {
+		return "", false
+	}
+
+	site, err := publicsuffix.EffectiveTLDPlusOne(host)
+	if err != nil {
+		// Without this fallback every host lacking an eTLD+1 would map to ""
+		// and unrelated hosts would compare equal.
+		return host, true
+	}
+	return site, true
 }
 
 func containsPasswordInput(node *html.Node) bool {