diff --git a/.env b/.env new file mode 100644 index 0000000..a41e343 --- /dev/null +++ b/.env @@ -0,0 +1,11 @@ +# Application port +APP_PORT=8080 + +# Page size for the URL status check +URL_STATUS_CHECK_PAGE_SIZE=10 + +# Outgoing scrape request timeout (in seconds) +OUT_GOING_SCRAPE_REQ_TIMEOUT=30 + +# Outgoing URL accessibility check timeout (in seconds) +OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT=10 diff --git a/cmd/main.go b/cmd/main.go index e0cff67..aba70c0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -1,8 +1,10 @@ package main import ( + "fmt" "log" + "scraper/config" "scraper/handlers" "github.com/gin-contrib/cors" @@ -17,5 +19,5 @@ func main() { router.GET("/scrape", handlers.ScrapeHandler) router.GET("/scrape/:id/:page", handlers.PageHandler) - log.Fatal(router.Run(":8080")) + log.Fatal(router.Run(fmt.Sprintf(":%s", config.GetAppPort()))) } diff --git a/config/configurations.go b/config/configurations.go new file mode 100644 index 0000000..630d821 --- /dev/null +++ b/config/configurations.go @@ -0,0 +1,73 @@ +package config + +import ( + "fmt" + "os" + "scraper/logger" + "strconv" +) + +// Default values for environment variables +const ( + defaultAppPort = "8080" + defaultURLCheckPageSize = 10 + defaultOutgoingScrapeRequestTimeout = 30 + defaultOutgoingAccessibilityCheckTimeout = 10 +) + +// Configuration variables initialized once +var ( + appPort string + urlCheckPageSize int + outgoingScrapeRequestTimeout int + outgoingAccessibilityCheckTimeout int +) + +func init() { + // Load environment variables and set defaults if necessary + appPort = getEnv("APP_PORT", defaultAppPort) + + urlCheckPageSize = parseEnvAsInt("URL_STATUS_CHECK_PAGE_SIZE", defaultURLCheckPageSize) + outgoingScrapeRequestTimeout = parseEnvAsInt("OUT_GOING_SCRAPE_REQ_TIMEOUT", defaultOutgoingScrapeRequestTimeout) + outgoingAccessibilityCheckTimeout = parseEnvAsInt("OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT", defaultOutgoingAccessibilityCheckTimeout) +} + +// Helper function to get 
environment variable or return a default +func getEnv(key, defaultValue string) string { + value := os.Getenv(key) + if value == "" { + return defaultValue + } + return value +} + +// Helper function to parse environment variable as int or return a default +func parseEnvAsInt(key string, defaultValue int) int { + value := os.Getenv(key) + if value == "" { + return defaultValue + } + parsedValue, err := strconv.Atoi(value) + if err != nil { + logger.Error(fmt.Sprintf("Invalid value for %s: %v", key, err)) + return defaultValue + } + return parsedValue +} + +// Exported getter functions +func GetAppPort() string { + return appPort +} + +func GetURLCheckPageSize() int { + return urlCheckPageSize +} + +func GetOutgoingScrapeRequestTimeout() int { + return outgoingScrapeRequestTimeout +} + +func GetOutgoingAccessibilityCheckTimeout() int { + return outgoingAccessibilityCheckTimeout +} diff --git a/config/constants.go b/config/constants.go deleted file mode 100644 index 1dbade8..0000000 --- a/config/constants.go +++ /dev/null @@ -1,3 +0,0 @@ -package config - -const PageSize = 10 diff --git a/docker-compose.yml b/docker-compose.yml index db66ff6..70e4f47 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,9 +5,9 @@ services: build: context: . 
dockerfile: Dockerfile + env_file: + - .env image: scraper-api-image:latest hostname: scraper-api ports: - - "8080:8080" - environment: - - PORT=8080 + - "${APP_PORT}:${APP_PORT}" diff --git a/handlers/scrape.go b/handlers/scrape.go index d174449..1c6beca 100644 --- a/handlers/scrape.go +++ b/handlers/scrape.go @@ -26,7 +26,7 @@ func ScrapeHandler(context *gin.Context) { Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification }, - Timeout: 30 * time.Second, + Timeout: time.Duration(config.GetOutgoingScrapeRequestTimeout()) * time.Second, } if baseURL == "" { @@ -58,10 +58,10 @@ func ScrapeHandler(context *gin.Context) { // Stored page infomation mapped to the returned request ID. requestID := storage.StorePageInfo(pageInfo) // Here we check the status of 10 (config.PageSize) scraped URLs. - inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.PageSize, len(pageInfo.URLs))) - totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize) + inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))) + totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize()) - context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.PageSize, len(pageInfo.URLs)))) + context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))) } // This handles subsequent pagination requests to check status of URLs. 
@@ -70,7 +70,7 @@ func PageHandler(context *gin.Context) { Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification }, - Timeout: 10 * time.Second, + Timeout: time.Duration(config.GetOutgoingAccessibilityCheckTimeout()) * time.Second, } // Request ID is required to fetch infromation from the in-memory storage. requestID := context.Param("id") @@ -91,7 +91,7 @@ func PageHandler(context *gin.Context) { return } - start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.PageSize) + start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.GetURLCheckPageSize()) if start >= len(pageInfo.URLs) { logger.Debug(fmt.Sprintf("Requested page [%d] not found", pageNum)) context.JSON(http.StatusNotFound, utils.BuildErrorResponse("page not found")) @@ -100,7 +100,7 @@ func PageHandler(context *gin.Context) { // Check the URL status for URLs on the given pagination page. inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end) - totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize) + totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize()) context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo, inaccessibleCount, start, end)) } diff --git a/logger/logger.go b/logger/logger.go index c2a4f0c..db2775c 100644 --- a/logger/logger.go +++ b/logger/logger.go @@ -7,19 +7,26 @@ import ( // Set custom loggers for each log level var ( - DEBUG = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags) - INFO = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags) - ERROR = log.New(os.Stderr, "[scraper-ERROR] ", log.LstdFlags) + debugLogger = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags) + infoLogger = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags) + errorLogger = log.New(os.Stderr, "[scraper-ERROR] ", log.LstdFlags) ) func Debug(text string) { - DEBUG.Println(text) + 
debugLogger.Println(text) } func Info(text string) { - INFO.Println(text) + infoLogger.Println(text) } -func Error(err error) { - ERROR.Println(err) +func Error(err interface{}) { + switch v := err.(type) { + case string: + errorLogger.Println(v) + case error: + errorLogger.Println(v) + default: + errorLogger.Println("Unknown error type") + } } diff --git a/readme.md b/readme.md index 887fa06..bcf003d 100644 --- a/readme.md +++ b/readme.md @@ -23,6 +23,24 @@ An API service to scrape a URL and get a summary. * We can replace the in-memory storage with a database. * We can use a messaging technique to pass data changes in real-time to the UI. +## Configurations + +You can change the following configurations in the `.env` file. + +```bash +# Application port +APP_PORT=8080 + +# Page size for the URL status check +URL_STATUS_CHECK_PAGE_SIZE=10 + +# Outgoing scrape request timeout (in seconds) +OUT_GOING_SCRAPE_REQ_TIMEOUT=30 + +# Outgoing URL accessibility check timeout (in seconds) +OUT_GOING_URL_ACCESSIBILITY_CHECK_TIMEOUT=10 +``` + ## How to run using Docker * Run `docker-compose up --build` diff --git a/utils/helpers.go b/utils/helpers.go index 918dc30..eb22fa1 100644 --- a/utils/helpers.go +++ b/utils/helpers.go @@ -41,7 +41,7 @@ func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *mode return models.PageResponse{ RequestID: requestID, Pagination: models.Pagination{ - PageSize: config.PageSize, + PageSize: config.GetURLCheckPageSize(), CurrentPage: pageNum, TotalPages: totalPages, PrevPage: prevPage,