From d736254a626ea97e18243fa2db96d1bc20a7ff78 Mon Sep 17 00:00:00 2001
From: dushanlk
Date: Fri, 3 Jan 2025 20:11:35 +0530
Subject: [PATCH] Refactored error response/added comments

---
 ...ucytech.jpg => highlevel_arch_diagram.jpg} | Bin
 handlers/scrape.go                            | 28 ++++++++++++++----
 handlers/scrape_test.go                       |  2 +-
 logger/logger.go                              |  1 +
 models/entity.go                              |  2 +-
 readme.md                                     |  2 +-
 services/htmlparser.go                        | 10 +++++++
 services/urlstatus.go                         | 10 +------
 storage/memory.go                             |  8 +++++
 utils/helpers.go                              |  8 +++++
 10 files changed, 54 insertions(+), 17 deletions(-)
 rename docs/{lucytech.jpg => highlevel_arch_diagram.jpg} (100%)

diff --git a/docs/lucytech.jpg b/docs/highlevel_arch_diagram.jpg
similarity index 100%
rename from docs/lucytech.jpg
rename to docs/highlevel_arch_diagram.jpg
diff --git a/handlers/scrape.go b/handlers/scrape.go
index 9365ec6..26fd6d1 100644
--- a/handlers/scrape.go
+++ b/handlers/scrape.go
@@ -4,6 +4,7 @@ import (
 	"crypto/tls"
 	"fmt"
 	"net/http"
+	"net/url"
 	"strconv"
 	"strings"
 
@@ -14,8 +15,10 @@ import (
 	"scraper/utils"
 
 	"github.com/gin-gonic/gin"
+	"golang.org/x/net/publicsuffix"
 )
 
+// This handles the initial scraping request received from the client.
 func ScrapeHandler(context *gin.Context) {
 	baseURL := context.Query("url")
 	client := &http.Client{
@@ -26,58 +29,73 @@ func ScrapeHandler(context *gin.Context) {
 
 	if baseURL == "" {
 		logger.Debug("URL query parameter is required")
-		context.JSON(http.StatusBadRequest, gin.H{"error": "url query parameter is required"})
+		context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("url query parameter is required"))
 		return
 	} else {
 		if !strings.HasPrefix(baseURL, "http://") && !strings.HasPrefix(baseURL, "https://") {
 			baseURL = "http://" + baseURL
 		}
+
+		baseUrlParsed, _ := url.Parse(baseURL)
+		_, err := publicsuffix.EffectiveTLDPlusOne(baseUrlParsed.Host)
+		if err != nil {
+			logger.Error(err)
+			context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("Invalid URL format, please provide a valid URL."))
+			return
+		}
 	}
 
 	pageInfo, err := services.FetchPageInfo(client, baseURL)
 	if err != nil {
 		logger.Error(err)
-		context.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch page info"})
+		context.JSON(http.StatusInternalServerError, utils.BuildErrorResponse("Failed to fetch page info"))
 		return
 	}
 
+	// We store the scraped page info in-memory for use with pagination later.
+	// Stored page information is mapped to the returned request ID.
 	requestID := storage.StorePageInfo(pageInfo)
+	// Here we check the status of the first 10 (config.PageSize) scraped URLs.
 	inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.PageSize, len(pageInfo.URLs)))
 	totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)
 
 	context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.PageSize, len(pageInfo.URLs))))
 }
 
+// This handles subsequent pagination requests to check the status of URLs.
 func PageHandler(context *gin.Context) {
 	client := &http.Client{
 		Transport: &http.Transport{
 			TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
 		},
 	}
+	// The request ID is required to fetch information from the in-memory storage.
 	requestID := context.Param("id")
 	pageNumStr := context.Param("page")
 
+	// Retrieve page information from in-memory storage using the request ID.
 	pageInfo, exists := storage.RetrievePageInfo(requestID)
 	if !exists {
 		logger.Debug(fmt.Sprintf("Requested ID [%s] not found in the local storage", requestID))
-		context.JSON(http.StatusNotFound, gin.H{"error": "request ID not found"})
+		context.JSON(http.StatusNotFound, utils.BuildErrorResponse("request ID not found"))
 		return
 	}
 
 	pageNum, err := strconv.Atoi(pageNumStr)
 	if err != nil || pageNum < 1 {
 		logger.Error(err)
-		context.JSON(http.StatusBadRequest, gin.H{"error": "invalid page number"})
+		context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("invalid page number"))
 		return
 	}
 
 	start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.PageSize)
 	if start >= len(pageInfo.URLs) {
 		logger.Debug(fmt.Sprintf("Requested page [%d] not found", pageNum))
-		context.JSON(http.StatusNotFound, gin.H{"error": "page not found"})
+		context.JSON(http.StatusNotFound, utils.BuildErrorResponse("page not found"))
 		return
 	}
 
+	// Check the status of the URLs on the given pagination page.
 	inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
 	totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)
 
diff --git a/handlers/scrape_test.go b/handlers/scrape_test.go
index 7fe23ff..cd60e24 100644
--- a/handlers/scrape_test.go
+++ b/handlers/scrape_test.go
@@ -63,7 +63,7 @@ func TestScrapeHandler(test_type *testing.T) {
 			mockRequestID:  "",
 			expectedStatus: http.StatusInternalServerError,
 			expectedBody: map[string]interface{}{
-				"error": "failed to fetch page info",
+				"error": "Failed to fetch page info",
 			},
 		},
 	}
diff --git a/logger/logger.go b/logger/logger.go
index a26a666..c2a4f0c 100644
--- a/logger/logger.go
+++ b/logger/logger.go
@@ -5,6 +5,7 @@ import (
 	"os"
 )
 
+// Set custom loggers for each log level
 var (
 	DEBUG = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags)
 	INFO  = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags)
diff --git a/models/entity.go b/models/entity.go
index 9ab403a..5404834 100644
--- a/models/entity.go
+++ b/models/entity.go
@@ -13,5 +13,5 @@ type PageInfo struct {
 type URLStatus struct {
 	URL        string `json:"url"`
 	HTTPStatus int    `json:"http_status"`
-	Error      error  `json:"error"`
+	Error      string `json:"error"`
 }
diff --git a/readme.md b/readme.md
index 4c4e533..887fa06 100644
--- a/readme.md
+++ b/readme.md
@@ -4,7 +4,7 @@ An API service to scrape a URL and get a summary.
 
 ## High level architecture
 
-![High level diagram](./docs/lucytech.jpg)
+![High level diagram](./docs/highlevel_arch_diagram.jpg)
 
 ### Components
 
diff --git a/services/htmlparser.go b/services/htmlparser.go
index fa3efa9..2136f46 100644
--- a/services/htmlparser.go
+++ b/services/htmlparser.go
@@ -12,6 +12,7 @@ import (
 	"golang.org/x/net/publicsuffix"
 )
 
+// This is to fetch the HTML content of the given URL.
 func FetchPageInfo(client *http.Client, baseURL string) (*models.PageInfo, error) {
 	resp, err := client.Get(baseURL)
 	if err != nil {
@@ -23,6 +24,7 @@ func FetchPageInfo(client *http.Client, baseURL string) (*models.PageInfo, error
 	return ParseHTML(resp.Body, baseURL)
 }
 
+// This is to parse the HTML content and extract the required data.
 func ParseHTML(body io.Reader, baseURL string) (*models.PageInfo, error) {
 	pageInfo := &models.PageInfo{HeadingCounts: make(map[string]int)}
 	doc, err := html.Parse(body)
@@ -73,6 +75,7 @@ func traverse(node *html.Node, visit func(*html.Node)) {
 	}
 }
 
+// This is to extract the link (href attribute) from an HTML node.
 func extractHref(node *html.Node) string {
 	for _, attr := range node.Attr {
 		if attr.Key == "href" {
@@ -82,12 +85,15 @@ func extractHref(node *html.Node) string {
 	return ""
 }
 
+// This is to build the absolute URL from the given baseURL and path.
 func resolveURL(baseURL, href string) string {
 	base, _ := url.Parse(baseURL)
 	rel, _ := url.Parse(href)
 	return base.ResolveReference(rel).String()
 }
 
+// This is to compare the registrable domains (eTLD+1) of found URLs against the scraped page URL
+// to determine whether found URLs are internal or external links.
 func isInternal(baseUrl, scrappedUrl string) bool {
 	baseUrlParsed, _ := url.Parse(baseUrl)
 	scrappedUrlParsed, _ := url.Parse(scrappedUrl)
@@ -98,6 +104,8 @@ func isInternal(baseUrl, scrappedUrl string) bool {
 	return strings.EqualFold(baseUrlTld, scrappedUrlTld)
 }
 
+// This is to check if the scraped HTML content has a password input.
+// Based on this we decide if the page contains a login form.
 func containsPasswordInput(node *html.Node) bool {
 	if node.Type == html.ElementNode && node.Data == "input" {
 		for _, attr := range node.Attr {
@@ -114,6 +122,7 @@ func containsPasswordInput(node *html.Node) bool {
 	return false
 }
 
+// This is to extract the scraped page title from the HTML title tag.
 func extractTitle(node *html.Node) string {
 	if node.Type == html.ElementNode && node.Data == "title" && node.FirstChild != nil {
 		return node.FirstChild.Data
@@ -127,6 +136,7 @@ func extractTitle(node *html.Node) string {
 	return ""
 }
 
+// This is to extract the HTML version of the scraped page.
 func extractHtmlVersion(node *html.Node) string {
 	// Check for a "version" attribute
 	for _, attr := range node.Attr {
diff --git a/services/urlstatus.go b/services/urlstatus.go
index 9f4578b..f34dd49 100644
--- a/services/urlstatus.go
+++ b/services/urlstatus.go
@@ -10,14 +10,6 @@ import (
 // This is to check the URL status and decide wether it is accessible or not.
 // It marks the status of each collected URL.
 // Since the URL collection can be huge we check status based on given start and end positions.
-// Parameters:
-//
-//	urls - Collected URLs by scrapping a web page.
-//	start - starting position of the URL set.
-//	end - Ending position of the URL set.
-//
-// Returns:
-// inaccessible URL count.
 func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int) int {
 	var wg sync.WaitGroup
 	var mu sync.Mutex
@@ -37,7 +29,7 @@ func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int
 			inaccessibleCount++
 			mu.Unlock()
 
-			urls[idx].Error = err
+			urls[idx].Error = err.Error()
 			return
 		}
 
diff --git a/storage/memory.go b/storage/memory.go
index 83c7444..5d019e2 100644
--- a/storage/memory.go
+++ b/storage/memory.go
@@ -1,3 +1,7 @@
+// This is a simple in-memory storage for page info to support pagination.
+// Each stored page info is mapped to a random unique ID which is generated upon storing the data.
+// To retrieve stored page info, provide the ID returned when the data was stored.
+// This simple storage only supports store and retrieve operations as of now.
 package storage
 
 import (
@@ -12,6 +16,7 @@ var storage = struct {
 	data map[string]models.PageInfo
 }{data: make(map[string]models.PageInfo)}
 
+// This is to store page info.
 func StorePageInfo(info *models.PageInfo) string {
 	storage.Lock()
 	defer storage.Unlock()
@@ -21,6 +26,7 @@ func StorePageInfo(info *models.PageInfo) string {
 	return id
 }
 
+// This is to retrieve page info by unique ID.
 func RetrievePageInfo(id string) (*models.PageInfo, bool) {
 	storage.RLock()
 	defer storage.RUnlock()
@@ -29,10 +35,12 @@ func RetrievePageInfo(id string) (*models.PageInfo, bool) {
 	return &info, exists
 }
 
+// This is to generate the random unique ID.
 func generateID() string {
 	return time.Now().Format("20060102150405") + "-" + randomString(8)
 }
 
+// This is to generate a random string to append to the random unique ID.
 func randomString(size int) string {
 	const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
 	result := make([]byte, size)
diff --git a/utils/helpers.go b/utils/helpers.go
index 9f1334f..918dc30 100644
--- a/utils/helpers.go
+++ b/utils/helpers.go
@@ -5,6 +5,8 @@ import (
 	"math"
 	"scraper/config"
 	"scraper/models"
+
+	"github.com/gin-gonic/gin"
 )
 
 func CalculateTotalPages(totalItems, pageSize int) int {
@@ -24,6 +26,7 @@ func min(a, b int) int {
 	return b
 }
 
+// This is to build the response after a successful scrape.
 func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
 	var prevPage, nextPage *string
 	if pageNum > 1 {
@@ -59,3 +62,8 @@ func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *mode
 		},
 	}
 }
+
+// This is to build the error response.
+func BuildErrorResponse(message string) gin.H {
+	return gin.H{"error": message}
+}
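
With this change every handler error goes through the new utils.BuildErrorResponse helper, so all error bodies share the {"error": "<message>"} shape that the updated handlers/scrape_test.go expects. Below is a minimal sketch of a unit test for the helper itself; the file name utils/helpers_test.go is an assumption, as only handlers/scrape_test.go appears in the diff.

package utils

import "testing"

// Sketch only: checks that BuildErrorResponse wraps the message under the
// "error" key, matching what ScrapeHandler and PageHandler now return.
func TestBuildErrorResponse(t *testing.T) {
	resp := BuildErrorResponse("url query parameter is required")
	msg, ok := resp["error"].(string)
	if !ok || msg != "url query parameter is required" {
		t.Fatalf("unexpected error payload: %#v", resp)
	}
}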
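
The URL validation added to ScrapeHandler relies on publicsuffix.EffectiveTLDPlusOne returning an error for hosts that have no registrable domain. The following is a standalone sketch of that check outside the Gin handler, assuming that rejecting such hosts with a 400 is the intended behaviour; the sample URLs are illustrative only and not taken from the test suite.

package main

import (
	"fmt"
	"net/url"

	"golang.org/x/net/publicsuffix"
)

func main() {
	samples := []string{"http://example.com/page", "http://localhost", "http://invalid"}
	for _, raw := range samples {
		u, err := url.Parse(raw)
		if err != nil {
			fmt.Println(raw, "=> parse error:", err)
			continue
		}
		// Same check as in ScrapeHandler: if no eTLD+1 can be derived,
		// the URL is rejected before any scraping happens.
		if _, err := publicsuffix.EffectiveTLDPlusOne(u.Host); err != nil {
			fmt.Println(raw, "=> rejected:", err)
			continue
		}
		fmt.Println(raw, "=> accepted")
	}
}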