diff --git a/CocoCrawler/Builders/PageCrawlJobBuilder.cs b/CocoCrawler/Builders/PageCrawlJobBuilder.cs index 5a00b82..3ba47c9 100644 --- a/CocoCrawler/Builders/PageCrawlJobBuilder.cs +++ b/CocoCrawler/Builders/PageCrawlJobBuilder.cs @@ -1,4 +1,5 @@ -using CocoCrawler.CrawlOutputs; +using AngleSharp.Dom; +using CocoCrawler.CrawlOutputs; using CocoCrawler.Exceptions; using CocoCrawler.Job; using CocoCrawler.Job.PageBrowserActions; @@ -60,14 +61,17 @@ public PageCrawlJobBuilder ConfigurePageActions(Action optio return this; } + + /// /// Adds a task to open a page and perform openLinks tasks. /// /// The CSS selector to select the element to openLinks. /// The array of openLinks tasks to perform. /// The action to configure the page actions for the openLinks tasks. + /// A function to execute for each matching element, that produces the URL to follow. /// The updated instance. - public PageCrawlJobBuilder OpenLinks(string linksSelector, Action jobOptions, Action? options = null) + public PageCrawlJobBuilder OpenLinks(string linksSelector, Action jobOptions, Action? options = null, Func? linksSelectorFunc = null) { PageActionsBuilder? pageActionsBuilder = null; @@ -82,7 +86,7 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action? LinkProcessor { get; } + public PageCrawlJobBuilder JobBuilder { get; init; } + + public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null) + { + OpenLinksSelector = linksSelector; + PageActions = pageActions; + JobBuilder = builder; + } + + public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func? linkProcessor = null) + { + OpenLinksSelector = linksSelector; + PageActions = pageActions; + LinkProcessor = linkProcessor; + JobBuilder = builder; + } } diff --git a/CocoCrawler/Crawler/PuppeteerCrawler.cs b/CocoCrawler/Crawler/PuppeteerCrawler.cs index cf5e155..629b7d7 100644 --- a/CocoCrawler/Crawler/PuppeteerCrawler.cs +++ b/CocoCrawler/Crawler/PuppeteerCrawler.cs @@ -115,7 +115,7 @@ protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCr protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List newJobs) { - var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector); + var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor); Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length); diff --git a/CocoCrawler/Parser/AngleSharpParser.cs b/CocoCrawler/Parser/AngleSharpParser.cs index 84403cb..b4156bd 100644 --- a/CocoCrawler/Parser/AngleSharpParser.cs +++ b/CocoCrawler/Parser/AngleSharpParser.cs @@ -17,10 +17,12 @@ public virtual async Task Init(string html) _document = await context.OpenAsync(req => req.Content(html)); } - public virtual string[] ParseForLinks(string linksSelector) + public virtual string[] ParseForLinks(string linksSelector, Func? linkProcessor = null) { + linkProcessor ??= (element) => element.GetAttribute("href"); + return _document!.QuerySelectorAll(linksSelector) - .Select(link => link.GetAttribute("href")) + .Select(link => linkProcessor(link)) .Where(link => link is not null) .Select(link => link!) .ToArray(); diff --git a/CocoCrawler/Parser/IParser.cs b/CocoCrawler/Parser/IParser.cs index 033d5aa..4669d9b 100644 --- a/CocoCrawler/Parser/IParser.cs +++ b/CocoCrawler/Parser/IParser.cs @@ -1,4 +1,5 @@ -using CocoCrawler.Job.PageTasks; +using AngleSharp.Dom; +using CocoCrawler.Job.PageTasks; using Newtonsoft.Json.Linq; namespace CocoCrawler.Parser; @@ -6,7 +7,7 @@ namespace CocoCrawler.Parser; public interface IParser { Task Init(string html); - string[] ParseForLinks(string linksSelector); + string[] ParseForLinks(string linksSelector, Func? linkProcessor = null); JArray ExtractList(CrawlPageExtractListTask scrapeList); JObject ExtractObject(CrawlPageExtractObjectTask task); }