Merged
19 changes: 15 additions & 4 deletions README.md
@@ -37,6 +37,7 @@ Flyscrape is a command-line web scraping tool designed for those without <br />a
- **Scriptable:** Use JavaScript to write your data extraction logic.
- **System Cookies:** Give Flyscrape access to your browser's cookie store.
- **Browser Mode:** Render JavaScript-heavy pages using a headless browser.
- **Nested Scraping:** Extract data from linked pages within a single scrape.

## Overview

@@ -259,10 +260,20 @@ export const config = {
},
};

export default function ({ doc, url, absoluteURL }) {
// doc - Contains the parsed HTML document
// url - Contains the scraped URL
// absoluteURL(...) - Transforms relative URLs into absolute URLs
export default function ({ doc, url, absoluteURL, scrape }) {
// doc
// Contains the parsed HTML document.

// url
// Contains the scraped URL.

// absoluteURL("/foo")
// Transforms a relative URL into an absolute URL.

// scrape(url, function({ doc, url, absoluteURL, scrape }) {
// return { ... };
// })
// Scrapes a linked page and returns the scrape result.
}
```
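For illustration, here is a minimal script (not part of this diff) that exercises the new `scrape` parameter; the URL and selectors are hypothetical, and the `error` field mirrors the shape `scrape` returns when fetching the linked page fails (see the js.go changes below):

```javascript
export const config = {
  url: "https://example.com/listing", // hypothetical entry page
};

export default function ({ doc, scrape }) {
  // Follow the first link on the page and scrape the linked page.
  const link = doc.find("a").first().attr("href");

  const detail = scrape(link, function ({ doc, url }) {
    return {
      url: url,                       // absolute URL of the linked page
      heading: doc.find("h1").text(), // data from the linked page
    };
  });

  // On a failed fetch, scrape returns { error: "..." } instead of the
  // callback's result.
  return detail;
}
```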

23 changes: 23 additions & 0 deletions examples/hackernews_with_comments.js
@@ -0,0 +1,23 @@
export const config = {
url: "https://news.ycombinator.com/",
};

export default function({ doc, scrape }) {
const post = doc.find(".athing.submission").first();
const title = post.find(".titleline > a").text();
const commentsLink = post.next().find("a").last().attr("href");

const comments = scrape(commentsLink, function({ doc }) {
return doc.find(".comtr").map(comment => {
return {
author: comment.find(".hnuser").text(),
text: comment.find(".commtext").text(),
};
});
});

return {
title,
comments,
};
}
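Assuming a local build of this branch, the example above should be runnable with `flyscrape run examples/hackernews_with_comments.js`; it scrapes the first front-page submission and returns its title together with the author and text of each comment.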
49 changes: 39 additions & 10 deletions js.go
@@ -27,8 +27,9 @@ var ScriptTemplate []byte
type Config []byte

type ScrapeParams struct {
HTML string
URL string
HTML string
URL string
Process func(url string) ([]byte, error)
}

type ScrapeFunc func(ScrapeParams) (any, error)
@@ -167,26 +168,21 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
return nil, errors.New("failed to export scrape function")
}

return func(p ScrapeParams) (any, error) {
lock.Lock()
defer lock.Unlock()

var newArg func(p ScrapeParams) (*goja.Object, error)
newArg = func(p ScrapeParams) (*goja.Object, error) {
doc, err := DocumentFromString(p.HTML)
if err != nil {
log.Println(err)
return nil, err
}

baseurl, err := url.Parse(p.URL)
if err != nil {
log.Println(err)
return nil, err
}

absoluteURL := func(ref string) string {
abs, err := baseurl.Parse(ref)
if err != nil {
log.Println(err)
return ref
}
return abs.String()
@@ -196,8 +192,41 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
o.Set("url", p.URL)
o.Set("doc", doc)
o.Set("absoluteURL", absoluteURL)
o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value {
url = absoluteURL(url)

html, err := p.Process(url)
if err != nil {
return vm.ToValue(map[string]any{"error": err.Error()})
}

newp := ScrapeParams{
HTML: string(html),
URL: url,
Process: p.Process,
}

arg, err := newArg(newp)
if err != nil {
return vm.ToValue(map[string]any{"error": err.Error()})
}

return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
})

return o, nil
}

return func(p ScrapeParams) (any, error) {
lock.Lock()
defer lock.Unlock()

arg, err := newArg(p)
if err != nil {
return nil, err
}

ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{o}})
ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}})
if goja.IsUndefined(ret) {
return nil, nil
}
7 changes: 4 additions & 3 deletions js_lib_test.go
@@ -8,6 +8,7 @@ import (
"encoding/json"
"net/http"
"os"
"sync/atomic"
"testing"

"github.com/philippta/flyscrape"
@@ -203,10 +204,10 @@ func TestJSLibHTTPDownload(t *testing.T) {
http.download("https://example.com/404.txt");
`

nreqs := 0
var nreqs atomic.Int32
client := &http.Client{
Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
nreqs++
nreqs.Add(1)

if r.URL.Path == "/content-disposition" {
resp, err := flyscrape.MockResponse(200, "hello world")
@@ -233,7 +234,7 @@

wait()

require.Equal(t, nreqs, 8)
require.Equal(t, nreqs.Load(), int32(8))
require.FileExists(t, "foo.txt")
require.FileExists(t, "dir/my-foo.txt")
require.FileExists(t, "dir/bar.txt")
92 changes: 92 additions & 0 deletions js_test.go
@@ -168,6 +168,98 @@ func TestJSScrapeNaN(t *testing.T) {
require.Nil(t, result)
}

func TestJSScrapeParamURL(t *testing.T) {
js := `
export default function({ url }) {
return url;
}
`
exports, err := flyscrape.Compile(js, nil)
require.NoError(t, err)

result, err := exports.Scrape(flyscrape.ScrapeParams{
HTML: html,
URL: "http://localhost/",
})
require.NoError(t, err)
require.Equal(t, "http://localhost/", result)
}

func TestJSScrapeParamAbsoluteURL(t *testing.T) {
js := `
export default function({ absoluteURL }) {
return absoluteURL("/foo");
}
`
exports, err := flyscrape.Compile(js, nil)
require.NoError(t, err)

result, err := exports.Scrape(flyscrape.ScrapeParams{
HTML: html,
URL: "http://localhost/",
})
require.NoError(t, err)
require.Equal(t, "http://localhost/foo", result)
}

func TestJSScrapeParamScrape(t *testing.T) {
js := `
export default function({ scrape }) {
return scrape("/foo", function({ url }) {
return {
url: url,
foo: "bar",
};
});
}
`
exports, err := flyscrape.Compile(js, nil)
require.NoError(t, err)

result, err := exports.Scrape(flyscrape.ScrapeParams{
HTML: html,
URL: "http://localhost/",
Process: func(url string) ([]byte, error) {
return nil, nil
},
})
require.NoError(t, err)
require.Equal(t, map[string]any{
"url": "http://localhost/foo",
"foo": "bar",
}, result)
}

func TestJSScrapeParamScrapeDeep(t *testing.T) {
js := `
export default function({ scrape }) {
return scrape("/foo/", function({ url, scrape }) {
return {
url: url,
deep: scrape("bar", function({ url }) {
return url;
}),
};
});
}
`
exports, err := flyscrape.Compile(js, nil)
require.NoError(t, err)

result, err := exports.Scrape(flyscrape.ScrapeParams{
HTML: html,
URL: "http://localhost/",
Process: func(url string) ([]byte, error) {
return nil, nil
},
})
require.NoError(t, err)
require.Equal(t, map[string]any{
"url": "http://localhost/foo/",
"deep": "http://localhost/foo/bar",
}, result)
}

func TestJSCompileError(t *testing.T) {
exports, err := flyscrape.Compile("import foo;", nil)
require.Error(t, err)
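The two nested-`scrape` tests above also pin down URL resolution: a relative URL passed to a nested `scrape` resolves against the URL of the page being scraped at that level, not against the entry page. Restated as a standalone script (a sketch of the tested behavior, reusing the tests' localhost URLs):

```javascript
export default function ({ scrape }) {
  // Entry page: http://localhost/
  return scrape("/foo/", function ({ url, scrape }) {
    // url is "http://localhost/foo/"
    return scrape("bar", function ({ url }) {
      // "bar" resolves against http://localhost/foo/,
      // so url is "http://localhost/foo/bar"
      return url;
    });
  });
}
```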
3 changes: 3 additions & 0 deletions module.go
@@ -63,6 +63,9 @@ func LoadModules(cfg Config) []Module {

// load standard modules in order
for _, id := range moduleOrder {
if _, ok := loaded[id]; ok {
continue
}
mod := modules[id].ModuleInfo().New()
if err := json.Unmarshal(cfg, mod); err != nil {
panic("failed to decode config: " + err.Error())
54 changes: 53 additions & 1 deletion scrape.go
@@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) {
}
}()

response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
p := ScrapeParams{
HTML: string(response.Body),
URL: request.URL,
Process: s.processImmediate,
}

response.Data, err = s.ScrapeFunc(p)
if err != nil {
response.Error = err
return
@@ -212,6 +218,52 @@
}
}

func (s *Scraper) processImmediate(url string) ([]byte, error) {
request := &Request{
Method: http.MethodGet,
URL: url,
Headers: http.Header{},
Cookies: s.Client.Jar,
}

for _, mod := range s.Modules {
if v, ok := mod.(RequestBuilder); ok {
v.BuildRequest(request)
}
}

req, err := http.NewRequest(request.Method, request.URL, nil)
if err != nil {
return nil, err
}
req.Header = request.Headers

for _, mod := range s.Modules {
if v, ok := mod.(RequestValidator); ok {
if !v.ValidateRequest(request) {
return nil, nil
}
}
}

resp, err := s.Client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()

if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
}

body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

return body, nil
}

func (s *Scraper) enqueueJob(url string, depth int) {
url = strings.TrimSpace(url)
if url == "" {