Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func main() {
router.Use(cors.Default())

router.GET("/scrape", handlers.ScrapeHandler)
router.GET("/scrape/:id/:page", handlers.PageHandler)
router.GET("/scrape/:session_id/:id/:page", handlers.PageHandler)

log.Fatal(router.Run(fmt.Sprintf(":%s", config.GetAppPort())))
}
30 changes: 25 additions & 5 deletions handlers/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ import (
// This handles the initial scraping request received from the client.
func ScrapeHandler(context *gin.Context) {
baseURL := context.Query("url")
// Session ID is used to map the session to the in-memory storage.
sessionId := context.Query("session_id")
// Generate a session ID if not provided by the client.
if sessionId == "" {
sessionId = storage.GenerateID()
}

client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
Expand Down Expand Up @@ -73,15 +80,20 @@ func ScrapeHandler(context *gin.Context) {
return
}

// Retrieve the database from the in-memory storage using the session ID.
database := storage.RetriveDatabase(sessionId)

// We store scraped page info in-memory to use with pagination later.
// Stored page information mapped to the returned request ID.
requestID := storage.StorePageInfo(pageInfo)
// requestID := storage.StorePageInfo(pageInfo)
requestId := database.StorePageInfo(pageInfo)

// Here we check the status of 10 (config.PageSize) scraped URLs.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0,
min(config.GetURLCheckPageSize(), len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo,
context.JSON(http.StatusOK, utils.BuildPageResponse(requestId, sessionId, 1, totalPages, pageInfo,
inaccessibleCount, 0, min(config.GetURLCheckPageSize(), len(pageInfo.URLs))))
}

Expand All @@ -93,12 +105,20 @@ func PageHandler(context *gin.Context) {
},
Timeout: time.Duration(config.GetOutgoingAccessibilityCheckTimeout()) * time.Second,
}

// Session ID is required to fetch the in-memory storage.
sessionId := context.Param("session_id")
// Request ID is required to fetch information from the in-memory storage.
requestID := context.Param("id")
pageNumStr := context.Param("page")

// Retrieve the database from the in-memory storage using the session ID.
database := storage.RetriveDatabase(sessionId)

// Retrieve page information from in-memory storage using the request ID.
pageInfo, exists := storage.RetrievePageInfo(requestID)
// pageInfo, exists := storage.RetrievePageInfo(requestID)
pageInfo, exists := database.RetrievePageInfo(requestID)

if !exists {
logger.Debug(fmt.Sprintf("Requested ID [%s] not found in the local storage", requestID))
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("request ID not found"))
Expand All @@ -123,6 +143,6 @@ func PageHandler(context *gin.Context) {
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.GetURLCheckPageSize())

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, pageNum, totalPages, pageInfo,
inaccessibleCount, start, end))
context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, sessionId, pageNum,
totalPages, pageInfo, inaccessibleCount, start, end))
}
39 changes: 33 additions & 6 deletions handlers/scrape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"net"
"net/http"
"net/http/httptest"
"reflect"
"scraper/models"
"scraper/services"
"scraper/storage"
Expand All @@ -23,6 +24,7 @@ func TestScrapeHandler(test_type *testing.T) {
mockPageInfo *models.PageInfo
mockError error
mockRequestID string
mockSessionID string
expectedStatus int
expectedBody map[string]interface{}
}{
Expand All @@ -42,6 +44,7 @@ func TestScrapeHandler(test_type *testing.T) {
},
mockError: nil,
mockRequestID: "mockRequestID",
mockSessionID: "mockSessionID",
expectedStatus: http.StatusOK,
},
{
Expand All @@ -62,6 +65,7 @@ func TestScrapeHandler(test_type *testing.T) {
mockPageInfo: nil,
mockError: assert.AnError,
mockRequestID: "",
mockSessionID: "",
expectedStatus: http.StatusInternalServerError,
expectedBody: map[string]interface{}{
"error": "An unexpected error occurred",
Expand All @@ -77,6 +81,7 @@ func TestScrapeHandler(test_type *testing.T) {
IsTimeout: true,
},
mockRequestID: "",
mockSessionID: "",
expectedStatus: http.StatusGatewayTimeout,
expectedBody: map[string]interface{}{
"error": "Request timeout during the page fetch",
Expand All @@ -92,6 +97,7 @@ func TestScrapeHandler(test_type *testing.T) {
IsTimeout: false,
},
mockRequestID: "",
mockSessionID: "",
expectedStatus: http.StatusBadGateway,
expectedBody: map[string]interface{}{
"error": "Failed to reach the requested URL",
Expand All @@ -105,6 +111,7 @@ func TestScrapeHandler(test_type *testing.T) {
mockPageInfo: nil,
mockError: nil,
mockRequestID: "",
mockSessionID: "",
expectedStatus: http.StatusBadRequest,
expectedBody: map[string]interface{}{
"error": "Invalid URL format, please provide a valid URL.",
Expand All @@ -121,10 +128,21 @@ func TestScrapeHandler(test_type *testing.T) {
})
defer patchFetchPageInfo.Unpatch()

patchStorePageInfo := monkey.Patch(storage.StorePageInfo,
func(info *models.PageInfo) string {
return test_data.mockRequestID
patchRetriveDatabase := monkey.Patch(storage.RetriveDatabase,
func(sessionID string) storage.Database {
return &storage.InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
})
defer patchRetriveDatabase.Unpatch()

patchStorePageInfo := monkey.PatchInstanceMethod(
reflect.TypeOf(&storage.InMemoryDatabase{}), // Type of the struct
"StorePageInfo", // Method name to patch
func(db *storage.InMemoryDatabase, info *models.PageInfo) string {
// Mocked implementation
db.Data[test_data.mockRequestID] = *info
return test_data.mockRequestID
},
)
defer patchStorePageInfo.Unpatch()

router := gin.Default()
Expand Down Expand Up @@ -205,10 +223,19 @@ func TestPageHandler(test_type *testing.T) {
for _, test_data := range tests {
test_type.Run(test_data.name, func(test_type *testing.T) {

patchRetrievePageInfo := monkey.Patch(storage.RetrievePageInfo,
func(id string) (*models.PageInfo, bool) {
return test_data.mockPageInfo, test_data.mockExists
patchRetriveDatabase := monkey.Patch(storage.RetriveDatabase,
func(sessionID string) storage.Database {
return &storage.InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
})
defer patchRetriveDatabase.Unpatch()

patchRetrievePageInfo := monkey.PatchInstanceMethod(
reflect.TypeOf(&storage.InMemoryDatabase{}), // Type of the struct
"RetrievePageInfo", // Method name to patch
func(db *storage.InMemoryDatabase, id string) (*models.PageInfo, bool) {
return test_data.mockPageInfo, test_data.mockExists
},
)
defer patchRetrievePageInfo.Unpatch()

patchCalculatePageBounds := monkey.Patch(utils.CalculatePageBounds,
Expand Down
1 change: 1 addition & 0 deletions models/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type PaginatedURLs struct {

// PageResponse is the JSON payload returned for both the initial scrape
// and subsequent paginated requests.
type PageResponse struct {
	// RequestID identifies the stored page info for follow-up page requests.
	RequestID string `json:"request_id"`
	// SessionId identifies the per-session in-memory database.
	// NOTE(review): Go initialism convention would name this SessionID
	// (matching RequestID); renaming requires updating all callers, and the
	// JSON wire name "session_id" would be unaffected — TODO confirm and fix.
	SessionId string `json:"session_id"`
	// Pagination carries page size, current page, and prev/next links.
	Pagination Pagination `json:"pagination"`
	// Scraped carries the scraped page data for the current page.
	Scraped ScrapedData `json:"scraped"`
}
85 changes: 70 additions & 15 deletions storage/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,93 @@
package storage

import (
"fmt"
"math/rand"
"scraper/logger"
"scraper/models"
"sync"
"time"
)

var storage = struct {
// Database registry to store dbs per user session
// Here we set the initial capacity to 10000 to reduce resizing overhead.
var dbRegistry = struct {
sync.RWMutex
data map[string]models.PageInfo
}{data: make(map[string]models.PageInfo)}
dbs map[string]Database
}{dbs: make(map[string]Database, 10_000)}

// This is to store page info.
func StorePageInfo(info *models.PageInfo) string {
storage.Lock()
defer storage.Unlock()
type Database interface {
StorePageInfo(info *models.PageInfo) string
RetrievePageInfo(id string) (*models.PageInfo, bool)
}

// InMemoryDatabase is a map-backed implementation of Database guarded by
// an embedded RWMutex. Because it contains a mutex it must not be copied;
// always pass *InMemoryDatabase.
type InMemoryDatabase struct {
	sync.RWMutex
	// Data maps request IDs to their stored page info.
	Data map[string]models.PageInfo
}

// RetriveDatabase returns the Database for the given session ID, creating
// a new in-memory database for the session on first access.
// (The name's "Retrive" typo is kept for compatibility with callers.)
//
// BUG FIX: the previous version inserted into dbRegistry.dbs while holding
// only the read lock — a data race that can panic with "concurrent map
// writes" when two new sessions arrive at once. We now use a read-lock
// fast path plus a double-checked write-lock slow path.
func RetriveDatabase(sessionID string) Database {
	// Fast path: most requests find an existing session database.
	dbRegistry.RLock()
	db, exists := dbRegistry.dbs[sessionID]
	dbRegistry.RUnlock()
	if exists {
		return db
	}

	// Slow path: take the write lock to create the database. Re-check the
	// map because another goroutine may have created it between the two
	// lock acquisitions.
	dbRegistry.Lock()
	defer dbRegistry.Unlock()
	if db, exists := dbRegistry.dbs[sessionID]; exists {
		return db
	}
	// Initial capacity of 10000 reduces map-resizing overhead.
	db = &InMemoryDatabase{Data: make(map[string]models.PageInfo, 10_000)}
	dbRegistry.dbs[sessionID] = db
	return db
}

// StorePageInfo stores info in this database under a freshly generated
// request ID and returns that ID. Safe for concurrent use.
func (db *InMemoryDatabase) StorePageInfo(info *models.PageInfo) string {
	db.Lock()
	defer db.Unlock()

	id := GenerateID()
	db.Data[id] = *info
	// Log only the new entry's ID. Formatting the whole map (as before)
	// is O(n) work evaluated eagerly — even when debug logging is
	// disabled — and would dump every session's scraped data into logs.
	logger.Debug(fmt.Sprintf("Stored page info under ID [%s]", id))
	return id
}

// This is to retrieve page info by unique ID.
func RetrievePageInfo(id string) (*models.PageInfo, bool) {
storage.RLock()
defer storage.RUnlock()
// This is to retrieve page info by unique ID from the given database.
func (db *InMemoryDatabase) RetrievePageInfo(id string) (*models.PageInfo, bool) {
db.RLock()
defer db.RUnlock()

info, exists := storage.data[id]
info, exists := db.Data[id]
logger.Debug(fmt.Sprintf("Retrieved page info from:\n %v", db.Data))
return &info, exists
}

// var storage = struct {
// sync.RWMutex
// data map[string]models.PageInfo
// }{data: make(map[string]models.PageInfo)}

// // This is to store page info.
// func StorePageInfo(info *models.PageInfo) string {
// storage.Lock()
// defer storage.Unlock()

// id := generateID()
// storage.data[id] = *info
// return id
// }

// // This is to retrieve page info by unique ID.
// func RetrievePageInfo(id string) (*models.PageInfo, bool) {
// storage.RLock()
// defer storage.RUnlock()

// info, exists := storage.data[id]
// return &info, exists
// }

// This is to generate the random unique ID.
func generateID() string {
func GenerateID() string {
return time.Now().Format("20060102150405") + "-" + randomString(8)
}

Expand Down
11 changes: 7 additions & 4 deletions storage/memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ func TestStorePageInfo(test_type *testing.T) {
Title: "Test Page",
}

id := StorePageInfo(pageInfo)
database := RetriveDatabase("test-session")
id := database.StorePageInfo(pageInfo)

// Ensure the ID is not empty
assert.NotEmpty(test_type, id, "Generated ID should not be empty")

retrievedInfo, exists := RetrievePageInfo(id)
retrievedInfo, exists := database.RetrievePageInfo(id)

// Assert that the PageInfo exists
assert.True(test_type, exists, "Stored PageInfo should be retrievable")
Expand All @@ -26,8 +27,9 @@ func TestStorePageInfo(test_type *testing.T) {
}

func TestRetrievePageInfo_NotFound(test_type *testing.T) {
database := RetriveDatabase("test-session")
// Try retrieving a non-existent PageInfo
retrievedInfo, exists := RetrievePageInfo("nonexistent-id")
retrievedInfo, exists := database.RetrievePageInfo("nonexistent-id")

// Assert that the info does not exist
assert.False(test_type, exists, "Non-existent ID should not be found")
Expand All @@ -39,7 +41,8 @@ func TestRetrievePageInfo_NotFound(test_type *testing.T) {
func TestGenerateID(test_type *testing.T) {
// Call the private function indirectly by calling StorePageInfo
pageInfo := &models.PageInfo{Title: "Test Page"}
id := StorePageInfo(pageInfo)
database := RetriveDatabase("test-session")
id := database.StorePageInfo(pageInfo)

// Assert that the ID follows the expected format
assert.Regexp(test_type, `^\d{14}-[a-zA-Z0-9]{8}$`, id,
Expand Down
9 changes: 5 additions & 4 deletions utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,21 @@ func min(a, b int) int {
}

// This is to build the response after a successful scraping.
func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo,
inaccessible, start, end int) models.PageResponse {
func BuildPageResponse(requestID string, sessionID string, pageNum, totalPages int,
pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
var prevPage, nextPage *string
if pageNum > 1 {
prev := fmt.Sprintf("/scrape/%s/%d", requestID, pageNum-1)
prev := fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum-1)
prevPage = &prev
}
if end < len(pageInfo.URLs) {
next := fmt.Sprintf("/scrape/%s/%d", requestID, pageNum+1)
next := fmt.Sprintf("/scrape/%s/%s/%d", sessionID, requestID, pageNum+1)
nextPage = &next
}

return models.PageResponse{
RequestID: requestID,
SessionId: sessionID,
Pagination: models.Pagination{
PageSize: config.GetURLCheckPageSize(),
CurrentPage: pageNum,
Expand Down
Loading