Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,16 @@ jobs:
- name: Build Docker image
run: |
docker build -t go-scraper-api .

# Lint and report in ONE step. `$?` only reflects commands that ran earlier in
# the same script, so testing it at the top of a separate step can never see
# the linter's exit status; a failing lint step would also stop the job before
# a follow-up report step could run.
- name: Run golangci-lint
  run: |
    if ./golangci-lint run; then
      echo "golangci-lint found no issues. All good!"
    else
      echo "golangci-lint found issues [Max allowed line length: 100]; see the output above."
      # Propagate the failure so the job is marked as failed.
      exit 1
    fi
18 changes: 18 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# golangci-lint configuration for this repository.
run:
# Time budget for a lint run.
timeout: 2m

linters:
# Linters switched on explicitly.
enable:
- govet
- errcheck
- staticcheck
- unused
- gocyclo
- gofmt
- lll
# Linters switched off explicitly.
disable:
- funlen

linters-settings:
lll:
line-length: 100 # Maximum line length accepted by the lll linter.
6 changes: 4 additions & 2 deletions config/configurations.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ func init() {
appPort = getEnv("APP_PORT", defaultAppPort)

urlCheckPageSize = parseEnvAsInt("URL_STATUS_CHECK_PAGE_SIZE", defaultURLCheckPageSize)
outgoingScrapeRequestTimeout = parseEnvAsInt("OUT_GOING_SCRAPE_REQ_TIMEOUT", defaultOutgoingScrapeRequestTimeout)
outgoingAccessibilityCheckTimeout = parseEnvAsInt("OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT", defaultOutgoingAccessibilityCheckTimeout)
outgoingScrapeRequestTimeout = parseEnvAsInt("OUT_GOING_SCRAPE_REQ_TIMEOUT",
defaultOutgoingScrapeRequestTimeout)
outgoingAccessibilityCheckTimeout = parseEnvAsInt("OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT",
defaultOutgoingAccessibilityCheckTimeout)
}

// Helper function to get environment variable or return a default
Expand Down
Binary file added golangci-lint
Binary file not shown.
18 changes: 12 additions & 6 deletions handlers/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ func ScrapeHandler(context *gin.Context) {

if baseURL == "" {
logger.Debug("URL query parameter is required")
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("url query parameter is required"))
context.JSON(http.StatusBadRequest,
utils.BuildErrorResponse("url query parameter is required"))
return
} else {
if !strings.HasPrefix(baseURL, "http://") && !strings.HasPrefix(baseURL, "https://") {
Expand All @@ -42,26 +43,30 @@ func ScrapeHandler(context *gin.Context) {
_, err := publicsuffix.EffectiveTLDPlusOne(baseUrlParsed.Host)
if err != nil {
logger.Error(err)
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("Invalid URL format, please provide a valid URL."))
context.JSON(http.StatusBadRequest,
utils.BuildErrorResponse("Invalid URL format, please provide a valid URL."))
return
}
}

pageInfo, err := services.FetchPageInfo(client, baseURL)
if err != nil {
logger.Error(err)
context.JSON(http.StatusInternalServerError, utils.BuildErrorResponse("Failed to fetch page info"))
context.JSON(http.StatusInternalServerError,
utils.BuildErrorResponse("Failed to fetch page info"))
return
}

// We store scraped page info in-memory to use with pagination later.
// The stored page information is mapped to the returned request ID.
requestID := storage.StorePageInfo(pageInfo)
// Here we check the status of the first page of scraped URLs: at most
// config.GetURLCheckPageSize() of them, or fewer if fewer URLs were scraped.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0,
min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))))
context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo,
inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))))
}

// This handles subsequent pagination requests to check status of URLs.
Expand Down Expand Up @@ -102,5 +107,6 @@ func PageHandler(context *gin.Context) {
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo, inaccessibleCount, start, end))
context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo,
inaccessibleCount, start, end))
}
28 changes: 16 additions & 12 deletions handlers/scrape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,16 @@ func TestScrapeHandler(test_type *testing.T) {
for _, test_data := range tests {
test_type.Run(test_data.name, func(test_type *testing.T) {

patchFetchPageInfo := monkey.Patch(services.FetchPageInfo, func(client *http.Client, url string) (*models.PageInfo, error) {
return test_data.mockPageInfo, test_data.mockError
})
patchFetchPageInfo := monkey.Patch(services.FetchPageInfo,
func(client *http.Client, url string) (*models.PageInfo, error) {
return test_data.mockPageInfo, test_data.mockError
})
defer patchFetchPageInfo.Unpatch()

patchStorePageInfo := monkey.Patch(storage.StorePageInfo, func(info *models.PageInfo) string {
return test_data.mockRequestID
})
patchStorePageInfo := monkey.Patch(storage.StorePageInfo,
func(info *models.PageInfo) string {
return test_data.mockRequestID
})
defer patchStorePageInfo.Unpatch()

router := gin.Default()
Expand Down Expand Up @@ -159,14 +161,16 @@ func TestPageHandler(test_type *testing.T) {
for _, test_data := range tests {
test_type.Run(test_data.name, func(test_type *testing.T) {

patchRetrievePageInfo := monkey.Patch(storage.RetrievePageInfo, func(id string) (*models.PageInfo, bool) {
return test_data.mockPageInfo, test_data.mockExists
})
patchRetrievePageInfo := monkey.Patch(storage.RetrievePageInfo,
func(id string) (*models.PageInfo, bool) {
return test_data.mockPageInfo, test_data.mockExists
})
defer patchRetrievePageInfo.Unpatch()

patchCalculatePageBounds := monkey.Patch(utils.CalculatePageBounds, func(pageNum, totalItems, pageSize int) (int, int) {
return 0, 1
})
patchCalculatePageBounds := monkey.Patch(utils.CalculatePageBounds,
func(pageNum, totalItems, pageSize int) (int, int) {
return 0, 1
})
defer patchCalculatePageBounds.Unpatch()

router := gin.Default()
Expand Down
6 changes: 6 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ total: (statements) 86.6%

```

## How to run golangci-lint

1. Make sure the `GOPATH` environment variable points to the `golang/go` path.
2. Run the `./golangci-lint run` command.
   * The maximum allowed line length is 100 characters.

## API Documentation

#### Request
Expand Down
3 changes: 2 additions & 1 deletion services/htmlparser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ func TestFetchPageInfo(test_type *testing.T) {
test_type.Run(test_data.name, func(test_type *testing.T) {

if test_data.mockError != nil {
httpmock.RegisterResponder("GET", test_data.mockURL, httpmock.NewErrorResponder(test_data.mockError))
httpmock.RegisterResponder("GET", test_data.mockURL,
httpmock.NewErrorResponder(test_data.mockError))
} else {
httpmock.RegisterResponder("GET", test_data.mockURL,
httpmock.NewStringResponder(test_data.mockStatus, test_data.mockBody))
Expand Down
9 changes: 6 additions & 3 deletions services/urlstatus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@ func TestCheckURLStatus(test_type *testing.T) {
}

// Mock response for valid URL
httpmock.RegisterResponder("GET", "http://example.com/valid", httpmock.NewStringResponder(200, "OK"))
httpmock.RegisterResponder("GET", "http://example.com/valid",
httpmock.NewStringResponder(200, "OK"))

// Mock response for invalid URL (non-2xx status)
httpmock.RegisterResponder("GET", "http://example.com/invalid", httpmock.NewStringResponder(404, "Not Found"))
httpmock.RegisterResponder("GET", "http://example.com/invalid",
httpmock.NewStringResponder(404, "Not Found"))

// Mock response for error (e.g., network failure)
httpmock.RegisterResponder("GET", "http://example.com/error", httpmock.NewErrorResponder(fmt.Errorf("network error")))
httpmock.RegisterResponder("GET", "http://example.com/error",
httpmock.NewErrorResponder(fmt.Errorf("network error")))

inaccessibleCount := CheckURLStatus(client, urls, 0, len(urls))

Expand Down
10 changes: 7 additions & 3 deletions storage/memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ func TestRetrievePageInfo_NotFound(test_type *testing.T) {
// Assert that the info does not exist
assert.False(test_type, exists, "Non-existent ID should not be found")
// Assert that the returned PageInfo is empty
assert.Equal(test_type, &models.PageInfo{}, retrievedInfo, "Retrieved info should be an empty PageInfo for non-existent ID")
assert.Equal(test_type, &models.PageInfo{}, retrievedInfo,
"Retrieved info should be an empty PageInfo for non-existent ID")
}

func TestGenerateID(test_type *testing.T) {
Expand All @@ -41,7 +42,8 @@ func TestGenerateID(test_type *testing.T) {
id := StorePageInfo(pageInfo)

// Assert that the ID follows the expected format
assert.Regexp(test_type, `^\d{14}-[a-zA-Z0-9]{8}$`, id, "Generated ID should follow the correct format")
assert.Regexp(test_type, `^\d{14}-[a-zA-Z0-9]{8}$`, id,
"Generated ID should follow the correct format")
}

func TestRandomString(test_type *testing.T) {
Expand All @@ -52,6 +54,8 @@ func TestRandomString(test_type *testing.T) {

// Check that the string only contains valid characters
for _, char := range randomStr {
assert.Contains(test_type, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", string(char), "Random string should only contain valid characters")
assert.Contains(test_type,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", string(char),
"Random string should only contain valid characters")
}
}
3 changes: 2 additions & 1 deletion utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ func min(a, b int) int {
}

// BuildPageResponse builds the response returned after a successful scrape.
func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo,
inaccessible, start, end int) models.PageResponse {
var prevPage, nextPage *string
if pageNum > 1 {
prev := fmt.Sprintf("/scrape/%s/%d", requestID, pageNum-1)
Expand Down
Loading