Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Application port
APP_PORT=8080

# Page size for the URL status check
URL_STATUS_CHECK_PAGE_SIZE=10

# Outgoing scrape request timeout
OUT_GOING_SCRAPE_REQ_TIMEOUT=30 # in seconds

# Outgoing URL accessibility check timeout
OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT=10 # in seconds
4 changes: 3 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package main

import (
"fmt"
"log"

"scraper/config"
"scraper/handlers"

"github.com/gin-contrib/cors"
Expand All @@ -17,5 +19,5 @@ func main() {
router.GET("/scrape", handlers.ScrapeHandler)
router.GET("/scrape/:id/:page", handlers.PageHandler)

log.Fatal(router.Run(":8080"))
log.Fatal(router.Run(fmt.Sprintf(":%s", config.GetAppPort())))
}
73 changes: 73 additions & 0 deletions config/configurations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package config

import (
"fmt"
"os"
"scraper/logger"
"strconv"
)

// Default values used when the corresponding environment variable is
// unset, empty, or (for numeric values) malformed.
const (
	defaultAppPort                           = "8080"
	defaultURLCheckPageSize                  = 10
	defaultOutgoingScrapeRequestTimeout      = 30 // seconds (multiplied by time.Second at the call sites)
	defaultOutgoingAccessibilityCheckTimeout = 10 // seconds
)

// Package-level configuration values. They are populated exactly once
// by init() and exposed read-only through the exported getters.
var (
	appPort                           string
	urlCheckPageSize                  int
	outgoingScrapeRequestTimeout      int
	outgoingAccessibilityCheckTimeout int
)

// init loads all configuration from the environment once at package
// load time. Each value falls back to its default when the variable
// is unset, empty, or (for integers) does not parse.
func init() {
	// Load environment variables and set defaults if necessary
	appPort = getEnv("APP_PORT", defaultAppPort)

	urlCheckPageSize = parseEnvAsInt("URL_STATUS_CHECK_PAGE_SIZE", defaultURLCheckPageSize)
	outgoingScrapeRequestTimeout = parseEnvAsInt("OUT_GOING_SCRAPE_REQ_TIMEOUT", defaultOutgoingScrapeRequestTimeout)
	outgoingAccessibilityCheckTimeout = parseEnvAsInt("OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT", defaultOutgoingAccessibilityCheckTimeout)
}

// Helper function to get environment variable or return a default
func getEnv(key, defaultValue string) string {
value := os.Getenv(key)
if value == "" {
return defaultValue
}
return value
}

// parseEnvAsInt reads the environment variable named by key and
// converts it to an int. defaultValue is returned when the variable
// is unset or empty, or when it does not parse as an integer (in
// which case the bad value is also reported via the error logger).
func parseEnvAsInt(key string, defaultValue int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	n, err := strconv.Atoi(raw)
	if err == nil {
		return n
	}
	logger.Error(fmt.Sprintf("Invalid value for %s: %v", key, err))
	return defaultValue
}

// Exported getter functions

// GetAppPort returns the port the HTTP server listens on (APP_PORT).
func GetAppPort() string {
	return appPort
}

// GetURLCheckPageSize returns the number of scraped URLs whose status
// is checked per pagination page (URL_STATUS_CHECK_PAGE_SIZE).
func GetURLCheckPageSize() int {
	return urlCheckPageSize
}

// GetOutgoingScrapeRequestTimeout returns the outgoing scrape request
// timeout in seconds (OUT_GOING_SCRAPE_REQ_TIMEOUT).
func GetOutgoingScrapeRequestTimeout() int {
	return outgoingScrapeRequestTimeout
}

// GetOutgoingAccessibilityCheckTimeout returns the outgoing URL
// accessibility check timeout in seconds
// (OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT).
func GetOutgoingAccessibilityCheckTimeout() int {
	return outgoingAccessibilityCheckTimeout
}
3 changes: 0 additions & 3 deletions config/constants.go

This file was deleted.

6 changes: 3 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ services:
build:
context: .
dockerfile: Dockerfile
env_file:
- .env
image: scraper-api-image:latest
hostname: scraper-api
ports:
- "8080:8080"
environment:
- PORT=8080
- "${APP_PORT}:${APP_PORT}"
14 changes: 7 additions & 7 deletions handlers/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func ScrapeHandler(context *gin.Context) {
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
},
Timeout: 30 * time.Second,
Timeout: time.Duration(config.GetOutgoingScrapeRequestTimeout()) * time.Second,
}

if baseURL == "" {
Expand Down Expand Up @@ -58,10 +58,10 @@ func ScrapeHandler(context *gin.Context) {
// Stored page information mapped to the returned request ID.
requestID := storage.StorePageInfo(pageInfo)
// Here we check the status of the first config.GetURLCheckPageSize() scraped URLs.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.PageSize, len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.PageSize, len(pageInfo.URLs))))
context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))))
}

// This handles subsequent pagination requests to check status of URLs.
Expand All @@ -70,7 +70,7 @@ func PageHandler(context *gin.Context) {
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
},
Timeout: 10 * time.Second,
Timeout: time.Duration(config.GetOutgoingAccessibilityCheckTimeout()) * time.Second,
}
// Request ID is required to fetch information from the in-memory storage.
requestID := context.Param("id")
Expand All @@ -91,7 +91,7 @@ func PageHandler(context *gin.Context) {
return
}

start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.PageSize)
start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.GetURLCheckPageSize())
if start >= len(pageInfo.URLs) {
logger.Debug(fmt.Sprintf("Requested page [%d] not found", pageNum))
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("page not found"))
Expand All @@ -100,7 +100,7 @@ func PageHandler(context *gin.Context) {

// Check the URL status for URLs on the given pagination page.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo, inaccessibleCount, start, end))
}
21 changes: 14 additions & 7 deletions logger/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@ import (

// Per-level loggers: debug and info write to stdout, errors to
// stderr, each with a distinguishing prefix and the standard
// date/time flags. The stale pre-rename DEBUG/INFO/ERROR
// declarations (diff residue) are removed.
var (
	debugLogger = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags)
	infoLogger  = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags)
	errorLogger = log.New(os.Stderr, "[scraper-ERROR] ", log.LstdFlags)
)

func Debug(text string) {
DEBUG.Println(text)
debugLogger.Println(text)
}

func Info(text string) {
INFO.Println(text)
infoLogger.Println(text)
}

func Error(err error) {
ERROR.Println(err)
func Error(err interface{}) {
switch v := err.(type) {
case string:
errorLogger.Println(v)
case error:
errorLogger.Println(v)
default:
errorLogger.Println("Unknown error type")
}
}
18 changes: 18 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,24 @@ An API service to scrape a URL and get a summary.
* We can replace the in-memory storage with a database.
* We can use a messaging technique to pass data changes in real-time to the UI.

## Configurations

You can change the following configurations in the `.env` file.

```bash
# Application port
APP_PORT=8080

# Page size for the URL status check
URL_STATUS_CHECK_PAGE_SIZE=10

# Outgoing scrape request timeout
OUT_GOING_SCRAPE_REQ_TIMEOUT=30 # in seconds

# Outgoing URL accessibility check timeout
OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT=10 # in seconds
```

## How to run using Docker

* Run `docker-compose up --build`
Expand Down
2 changes: 1 addition & 1 deletion utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *mode
return models.PageResponse{
RequestID: requestID,
Pagination: models.Pagination{
PageSize: config.PageSize,
PageSize: config.GetURLCheckPageSize(),
CurrentPage: pageNum,
TotalPages: totalPages,
PrevPage: prevPage,
Expand Down
Loading