diff --git a/cmd/main.go b/cmd/main.go
index aba70c0..fd9ef45 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -17,7 +17,7 @@ func main() {
 	router.Use(cors.Default())
 
 	router.GET("/scrape", handlers.ScrapeHandler)
-	router.GET("/scrape/:id/:page", handlers.PageHandler)
+	router.GET("/scrape/:session_id/:id/:page", handlers.PageHandler)
 
 	log.Fatal(router.Run(fmt.Sprintf(":%s", config.GetAppPort())))
 }
diff --git a/handlers/scrape.go b/handlers/scrape.go
index b81d2fb..1e912dd 100644
--- a/handlers/scrape.go
+++ b/handlers/scrape.go
@@ -24,6 +24,13 @@ import (
 // This handles the initial scraping request received from the client.
 func ScrapeHandler(context *gin.Context) {
 	baseURL := context.Query("url")
+	// Session ID is used to map the session to the in-memory storage.
+	sessionID := context.Query("session_id")
+	// Generate a session ID if not provided by the client.
+	if sessionID == "" {
+		sessionID = storage.GenerateID()
+	}
+
 	client := &http.Client{
 		Transport: &http.Transport{
 			TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
@@ -73,15 +80,19 @@ func ScrapeHandler(context *gin.Context) {
 		return
 	}
 
+	// Retrieve the database from the in-memory storage using the session ID.
+	database := storage.RetrieveDatabase(sessionID)
+
 	// We store scraped page info in-memory to use with pagination later.
 	// Stored page information mapped to the returned request ID.
-	requestID := storage.StorePageInfo(pageInfo)
+	requestID := database.StorePageInfo(pageInfo)
+
 	// Here we check the status of 10 (config.PageSize) scraped URLs.
 	inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0,
 		min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))
 
 	totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())
 
-	context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo,
+	context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, sessionID, 1, totalPages, pageInfo,
 		inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))))
 }
@@ -93,12 +104,19 @@ func PageHandler(context *gin.Context) {
 		},
 		Timeout: time.Duration(config.GetOutgoingAccessibilityCheckTimeout()) * time.Second,
 	}
+
+	// Session ID is required to fetch the in-memory storage.
+	sessionID := context.Param("session_id")
 	// Request ID is required to fetch information from the in-memory storage.
 	requestID := context.Param("id")
 	pageNumStr := context.Param("page")
 
+	// Retrieve the database from the in-memory storage using the session ID.
+	database := storage.RetrieveDatabase(sessionID)
+	// Retrieve page information from in-memory storage using the request ID.
-	pageInfo, exists := storage.RetrievePageInfo(requestID)
+	pageInfo, exists := database.RetrievePageInfo(requestID)
+
 	if !exists {
 		logger.Debug(fmt.Sprintf("Requested ID [%s] not found in the local storage", requestID))
 		context.JSON(http.StatusNotFound, utils.BuildErrorResponse("request ID not found"))
@@ -123,6 +141,6 @@ func PageHandler(context *gin.Context) {
 	inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
 	totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())
 
-	context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo,
-		inaccessibleCount, start, end))
+	context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, sessionID, pageNum,
+		totalPages, pageInfo, inaccessibleCount, start, end))
 }
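
With the session segment in the route, a client first calls /scrape and then feeds the returned session_id and request_id back into the paginated endpoint. A minimal client sketch, assuming the service listens on localhost:8080 and that the JSON field names match models/response.go (host and port are illustrative):

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	// First request: session_id is optional, the server generates one if absent.
	resp, err := http.Get("http://localhost:8080/scrape?url=" + url.QueryEscape("https://example.com"))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var page struct {
		RequestID string `json:"request_id"`
		SessionID string `json:"session_id"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
		panic(err)
	}

	// Subsequent pages are addressed by session ID first, then request ID.
	fmt.Printf("GET /scrape/%s/%s/2\n", page.SessionID, page.RequestID)
}
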
diff --git a/handlers/scrape_test.go b/handlers/scrape_test.go
index 5bf0c8c..2ff45ee 100644
--- a/handlers/scrape_test.go
+++ b/handlers/scrape_test.go
@@ -5,6 +5,7 @@ import (
 	"net"
 	"net/http"
 	"net/http/httptest"
+	"reflect"
 	"scraper/models"
 	"scraper/services"
 	"scraper/storage"
@@ -23,6 +24,7 @@ func TestScrapeHandler(test_type *testing.T) {
 		mockPageInfo   *models.PageInfo
 		mockError      error
 		mockRequestID  string
+		mockSessionID  string
 		expectedStatus int
 		expectedBody   map[string]interface{}
 	}{
@@ -42,6 +44,7 @@ func TestScrapeHandler(test_type *testing.T) {
 			},
 			mockError:      nil,
 			mockRequestID:  "mockRequestID",
+			mockSessionID:  "mockSessionID",
 			expectedStatus: http.StatusOK,
 		},
 		{
@@ -62,6 +65,7 @@ func TestScrapeHandler(test_type *testing.T) {
 			mockPageInfo:   nil,
 			mockError:      assert.AnError,
 			mockRequestID:  "",
+			mockSessionID:  "",
 			expectedStatus: http.StatusInternalServerError,
 			expectedBody: map[string]interface{}{
 				"error": "An unexpected error occurred",
@@ -77,6 +81,7 @@ func TestScrapeHandler(test_type *testing.T) {
 				IsTimeout: true,
 			},
 			mockRequestID:  "",
+			mockSessionID:  "",
 			expectedStatus: http.StatusGatewayTimeout,
 			expectedBody: map[string]interface{}{
 				"error": "Request timeout during the page fetch",
@@ -92,6 +97,7 @@ func TestScrapeHandler(test_type *testing.T) {
 				IsTimeout: false,
 			},
 			mockRequestID:  "",
+			mockSessionID:  "",
 			expectedStatus: http.StatusBadGateway,
 			expectedBody: map[string]interface{}{
 				"error": "Failed to reach the requested URL",
@@ -105,6 +111,7 @@ func TestScrapeHandler(test_type *testing.T) {
 			mockPageInfo:   nil,
 			mockError:      nil,
 			mockRequestID:  "",
+			mockSessionID:  "",
 			expectedStatus: http.StatusBadRequest,
 			expectedBody: map[string]interface{}{
 				"error": "Invalid URL format, please provide a valid URL.",
@@ -121,10 +128,21 @@ func TestScrapeHandler(test_type *testing.T) {
 			})
 			defer patchFetchPageInfo.Unpatch()
 
-			patchStorePageInfo := monkey.Patch(storage.StorePageInfo,
-				func(info *models.PageInfo) string {
-					return test_data.mockRequestID
+			patchRetrieveDatabase := monkey.Patch(storage.RetrieveDatabase,
+				func(sessionID string) storage.Database {
+					return &storage.InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
 				})
+			defer patchRetrieveDatabase.Unpatch()
+
+			patchStorePageInfo := monkey.PatchInstanceMethod(
+				reflect.TypeOf(&storage.InMemoryDatabase{}), // Type of the struct
+				"StorePageInfo",                             // Method name to patch
+				func(db *storage.InMemoryDatabase, info *models.PageInfo) string {
+					// Mocked implementation
+					db.Data[test_data.mockRequestID] = *info
+					return test_data.mockRequestID
+				},
+			)
 			defer patchStorePageInfo.Unpatch()
 
 			router := gin.Default()
@@ -205,10 +223,19 @@ func TestPageHandler(test_type *testing.T) {
 
 	for _, test_data := range tests {
 		test_type.Run(test_data.name, func(test_type *testing.T) {
-			patchRetrievePageInfo := monkey.Patch(storage.RetrievePageInfo,
-				func(id string) (*models.PageInfo, bool) {
-					return test_data.mockPageInfo, test_data.mockExists
+			patchRetrieveDatabase := monkey.Patch(storage.RetrieveDatabase,
+				func(sessionID string) storage.Database {
+					return &storage.InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
 				})
+			defer patchRetrieveDatabase.Unpatch()
+
+			patchRetrievePageInfo := monkey.PatchInstanceMethod(
+				reflect.TypeOf(&storage.InMemoryDatabase{}), // Type of the struct
+				"RetrievePageInfo",                          // Method name to patch
+				func(db *storage.InMemoryDatabase, id string) (*models.PageInfo, bool) {
+					return test_data.mockPageInfo, test_data.mockExists
+				},
+			)
 			defer patchRetrievePageInfo.Unpatch()
 
 			patchCalculatePageBounds := monkey.Patch(utils.CalculatePageBounds,
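
Because StorePageInfo and RetrievePageInfo are now methods on *storage.InMemoryDatabase rather than package-level functions, monkey.Patch no longer applies and the tests switch to monkey.PatchInstanceMethod. A minimal standalone sketch of that pattern (the Store type is illustrative, not project code; note that bou.ke/monkey generally needs inlining disabled, e.g. go test -gcflags=all=-l, and is for tests only):

package example

import (
	"reflect"
	"testing"

	"bou.ke/monkey"
)

type Store struct{}

// The real method that the test will replace at runtime.
func (s *Store) Get(id string) string { return "real:" + id }

func TestPatchInstanceMethod(t *testing.T) {
	// Replace (*Store).Get for every receiver of that type.
	patch := monkey.PatchInstanceMethod(
		reflect.TypeOf(&Store{}), // receiver type to patch
		"Get",                    // method name
		func(s *Store, id string) string { return "mock:" + id },
	)
	defer patch.Unpatch() // restore the original implementation

	s := &Store{}
	if got := s.Get("42"); got != "mock:42" {
		t.Fatalf("expected mock:42, got %q", got)
	}
}
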
diff --git a/models/response.go b/models/response.go
index 03259ac..718a2bc 100644
--- a/models/response.go
+++ b/models/response.go
@@ -26,6 +26,7 @@ type PaginatedURLs struct {
 
 type PageResponse struct {
 	RequestID  string      `json:"request_id"`
+	SessionID  string      `json:"session_id"`
 	Pagination Pagination  `json:"pagination"`
 	Scraped    ScrapedData `json:"scraped"`
 }
diff --git a/storage/memory.go b/storage/memory.go
index 5d019e2..4e267d1 100644
--- a/storage/memory.go
+++ b/storage/memory.go
@@ -5,38 +5,70 @@ package storage
 
 import (
+	"fmt"
 	"math/rand"
+	"scraper/logger"
 	"scraper/models"
 	"sync"
 	"time"
 )
 
-var storage = struct {
+// Database registry that stores one database per user session.
+// Here we set the initial capacity to 10000 to reduce resizing overhead.
+var dbRegistry = struct {
 	sync.RWMutex
-	data map[string]models.PageInfo
-}{data: make(map[string]models.PageInfo)}
+	dbs map[string]Database
+}{dbs: make(map[string]Database, 10_000)}
 
-// This is to store page info.
-func StorePageInfo(info *models.PageInfo) string {
-	storage.Lock()
-	defer storage.Unlock()
+type Database interface {
+	StorePageInfo(info *models.PageInfo) string
+	RetrievePageInfo(id string) (*models.PageInfo, bool)
+}
+
+type InMemoryDatabase struct {
+	sync.RWMutex
+	Data map[string]models.PageInfo
+}
+
+// This is to retrieve the database by session ID.
+// If the database does not exist, it will create a new one for the current session.
+// Here we set the initial capacity to 10000 to reduce resizing overhead.
+// The full write lock is taken because a missing session inserts into the registry.
+func RetrieveDatabase(sessionID string) Database {
+	dbRegistry.Lock()
+	defer dbRegistry.Unlock()
+
+	db, exists := dbRegistry.dbs[sessionID]
+	if !exists {
+		db = &InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
+		dbRegistry.dbs[sessionID] = db
+	}
+	return db
+}
+
+// This is to store page info in the given database.
+func (db *InMemoryDatabase) StorePageInfo(info *models.PageInfo) string {
+	db.Lock()
+	defer db.Unlock()
 
-	id := generateID()
-	storage.data[id] = *info
+	id := GenerateID()
+	db.Data[id] = *info
+	logger.Debug(fmt.Sprintf("Updated database with page info:\n %v", db.Data))
 	return id
 }
 
-// This is to retrieve page info by unique ID.
-func RetrievePageInfo(id string) (*models.PageInfo, bool) {
-	storage.RLock()
-	defer storage.RUnlock()
+// This is to retrieve page info by unique ID from the given database.
+func (db *InMemoryDatabase) RetrievePageInfo(id string) (*models.PageInfo, bool) {
+	db.RLock()
+	defer db.RUnlock()
 
-	info, exists := storage.data[id]
+	info, exists := db.Data[id]
+	logger.Debug(fmt.Sprintf("Retrieved page info from:\n %v", db.Data))
 	return &info, exists
 }
 
 // This is to generate the random unique ID.
-func generateID() string {
+func GenerateID() string {
 	return time.Now().Format("20060102150405") + "-" + randomString(8)
 }
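
RetrieveDatabase takes the registry's write lock on every call because the first request for a session has to insert a new database, and a read lock alone would not permit that map write. If read contention on the registry ever matters, the usual double-checked pattern keeps the common path on the read lock. A sketch only, reusing the dbRegistry, Database, and InMemoryDatabase declarations above (the name RetrieveDatabaseFast is hypothetical):

// Sketch: read-optimized registry lookup with a re-check under the write lock.
func RetrieveDatabaseFast(sessionID string) Database {
	// Fast path: most calls hit an existing session, so try under the read lock.
	dbRegistry.RLock()
	db, exists := dbRegistry.dbs[sessionID]
	dbRegistry.RUnlock()
	if exists {
		return db
	}

	// Slow path: take the write lock and re-check, since another goroutine
	// may have created the session between the two lock acquisitions.
	dbRegistry.Lock()
	defer dbRegistry.Unlock()
	if db, exists := dbRegistry.dbs[sessionID]; exists {
		return db
	}
	db = &InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
	dbRegistry.dbs[sessionID] = db
	return db
}
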
diff --git a/storage/memory_test.go b/storage/memory_test.go
index fdd2718..b603eac 100644
--- a/storage/memory_test.go
+++ b/storage/memory_test.go
@@ -12,12 +12,13 @@ func TestStorePageInfo(test_type *testing.T) {
 		Title: "Test Page",
 	}
 
-	id := StorePageInfo(pageInfo)
+	database := RetrieveDatabase("test-session")
+	id := database.StorePageInfo(pageInfo)
 
 	// Ensure the ID is not empty
 	assert.NotEmpty(test_type, id, "Generated ID should not be empty")
 
-	retrievedInfo, exists := RetrievePageInfo(id)
+	retrievedInfo, exists := database.RetrievePageInfo(id)
 
 	// Assert that the PageInfo exists
 	assert.True(test_type, exists, "Stored PageInfo should be retrievable")
@@ -26,8 +27,9 @@ func TestRetrievePageInfo_NotFound(test_type *testing.T) {
+	database := RetrieveDatabase("test-session")
 	// Try retrieving a non-existent PageInfo
-	retrievedInfo, exists := RetrievePageInfo("nonexistent-id")
+	retrievedInfo, exists := database.RetrievePageInfo("nonexistent-id")
 
 	// Assert that the info does not exist
 	assert.False(test_type, exists, "Non-existent ID should not be found")
@@ -39,7 +41,8 @@ func TestGenerateID(test_type *testing.T) {
 	// Call the private function indirectly by calling StorePageInfo
 	pageInfo := &models.PageInfo{Title: "Test Page"}
-	id := StorePageInfo(pageInfo)
+	database := RetrieveDatabase("test-session")
+	id := database.StorePageInfo(pageInfo)
 
 	// Assert that the ID follows the expected format
 	assert.Regexp(test_type, `^\d{14}-[a-zA-Z0-9]{8}$`, id,
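
One gap worth covering now that storage is per-session: nothing above asserts that two sessions cannot see each other's data. A suggested test along these lines (not part of the diff; the session IDs are arbitrary), following the conventions of storage/memory_test.go:

func TestSessionIsolation(test_type *testing.T) {
	databaseA := RetrieveDatabase("test-session-a")
	databaseB := RetrieveDatabase("test-session-b")

	id := databaseA.StorePageInfo(&models.PageInfo{Title: "Session A Page"})

	// An ID stored under session A must not resolve under session B.
	_, exists := databaseB.RetrievePageInfo(id)
	assert.False(test_type, exists, "Sessions should not share stored page info")

	// It must still resolve under its own session.
	retrievedInfo, exists := databaseA.RetrievePageInfo(id)
	assert.True(test_type, exists, "Own session should retrieve stored page info")
	assert.Equal(test_type, "Session A Page", retrievedInfo.Title)
}
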
diff --git a/utils/helpers.go b/utils/helpers.go
index 84fc358..b70fdea 100644
--- a/utils/helpers.go
+++ b/utils/helpers.go
@@ -27,20 +27,21 @@ func min(a, b int) int {
 }
 
 // This is to build the response after a successful scraping.
-func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo,
-	inaccessible, start, end int) models.PageResponse {
+func BuildPageResponse(requestID string, sessionID string, pageNum, totalPages int,
+	pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
 	var prevPage, nextPage *string
 	if pageNum > 1 {
-		prev := fmt.Sprintf("/scrape/%s/%d", requestID, pageNum-1)
+		prev := fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum-1)
 		prevPage = &prev
 	}
 	if end < len(pageInfo.URLs) {
-		next := fmt.Sprintf("/scrape/%s/%d", requestID, pageNum+1)
+		next := fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum+1)
 		nextPage = &next
 	}
 
 	return models.PageResponse{
 		RequestID: requestID,
+		SessionID: sessionID,
 		Pagination: models.Pagination{
 			PageSize:    config.GetURLCheckPageSize(),
 			CurrentPage: pageNum,
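
To make the boundary behavior concrete: prevPage stays nil on the first page, and nextPage stays nil once end reaches the URL count. A standalone sketch of that logic with hypothetical values (a page size of 10 mirrors config.GetURLCheckPageSize(); empty strings stand in for the nil pointers):

package main

import "fmt"

// pageLinks mirrors the prev/next construction in BuildPageResponse.
func pageLinks(sessionID, requestID string, pageNum, end, totalURLs int) (prev, next string) {
	if pageNum > 1 {
		prev = fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum-1)
	}
	if end < totalURLs {
		next = fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum+1)
	}
	return prev, next
}

func main() {
	// Page 1 of 25 URLs at page size 10: no prev link, next points at page 2.
	fmt.Println(pageLinks("sess-1", "req-1", 1, 10, 25))

	// Page 3 is the last page: prev points at page 2, no next link.
	fmt.Println(pageLinks("sess-1", "req-1", 3, 25, 25))
}
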