前言

由于 PocketBase 自带了一个简洁、美观的管理后台,再加上其单文件运行、支持 S3 备份等特性,集成爬虫服务后,可以非常方便地查询和管理入库后的数据。前两篇博客(《PocketBase:极简全能后端》《简单扩展 PocketBase 功能》)已经在「理论」上简单介绍了 PocketBase,这篇博客则进行简单的实操。

本次的目标是在 PocketBase 内集成爬虫服务,爬取 GoodReads(类似豆瓣读书)的摘录入库。

技术栈:Go、PocketBase、Colly(爬虫框架)、goquery(HTML 解析)

Goodreads 相关摘录页分析

  • Goodreads 摘录按两种类型聚合:基于书籍、基于摘录分类

        graph TD
          书籍文体分类 --> 书籍 --> 摘录
          书籍作者 --> 书籍 --> 摘录
          摘录分类 --> 摘录
  • 两种聚合类型最后展示「摘录」的 HTML 结构完全一致:

    goodreads摘录HTML结构
    goodreads摘录HTML结构
  • 相关链接:

    • 摘录列表,可按分类筛选:
      • https://www.goodreads.com/quotes?page=1
      • https://www.goodreads.com/quotes/tag/<tag-name>?page=1
    • 书籍文体分类列表:https://www.goodreads.com/genres/list?page=1
    • 作者下的所有书籍列表:https://www.goodreads.com/author/list/<author-code>
    • 书籍介绍:https://www.goodreads.com/book/show/<book-code>
    • 书籍下的摘录列表:https://www.goodreads.com/work/quotes/<book-quote-code>

本次采用:书籍作者 → 书籍 → 摘录 这个链路操作。
「书籍作者」的编码可以从 3 遍历到 20 作为示例。

目录结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
.
├── config
│ └── init.go
├── crawlers
│ ├── goodreads.go
│ └── goodreads_quote.go
├── go.mod
├── go.sum
├── main.go
├── pb_data
├── pb_migrations
├── README.md
└── repository
    └── goodreads_qoute.go

代码及操作

  • 初始化项目:go mod init pocketbase-goodreads

  • 添加代码:

    config/init.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    package config

    import (
    "os"
    "path/filepath"
    "strings"
    "time"

    "github.com/pocketbase/pocketbase"
    "github.com/pocketbase/pocketbase/apis"
    "github.com/pocketbase/pocketbase/core"
    "github.com/pocketbase/pocketbase/plugins/ghupdate"
    "github.com/pocketbase/pocketbase/plugins/jsvm"
    "github.com/pocketbase/pocketbase/plugins/migratecmd"
    )

    // defaultPublicDir returns the default directory used for serving static files.
    //
    // When the executable path (os.Args[0]) lies under the OS temp directory —
    // presumably because the program was started with `go run` — it returns the
    // relative "./pb_public". Otherwise it returns the pb_public directory that
    // sits next to the executable.
    func defaultPublicDir() string {
        exe := os.Args[0]
        if !strings.HasPrefix(exe, os.TempDir()) {
            return filepath.Join(exe, "../pb_public")
        }

        return "./pb_public"
    }

    // InitApp wires the optional command line flags and plugins into app.
    //
    // It registers the persistent flags below on app.RootCmd, parses os.Args[1:],
    // registers the jsvm, migratecmd and ghupdate plugins with the parsed values,
    // and installs two hooks: one that applies the SELECT query timeout after
    // bootstrap and one that serves static files from publicDir before serving.
    func InitApp(app *pocketbase.PocketBase) {
        var (
            hooksDir      string
            hooksWatch    bool
            hooksPool     int
            migrationsDir string
            automigrate   bool
            publicDir     string
            indexFallback bool
            queryTimeout  int
        )

        // All flags live on the same persistent flag set of the root command.
        flags := app.RootCmd.PersistentFlags()
        flags.StringVar(&hooksDir, "hooksDir", "", "the directory with the JS app hooks")
        flags.BoolVar(&hooksWatch, "hooksWatch", true, "auto restart the app on pb_hooks file change")
        flags.IntVar(&hooksPool, "hooksPool", 25, "the total prewarm goja.Runtime instances for the JS app hooks execution")
        flags.StringVar(&migrationsDir, "migrationsDir", "", "the directory with the user defined migrations")
        flags.BoolVar(&automigrate, "automigrate", true, "enable/disable auto migrations")
        flags.StringVar(&publicDir, "publicDir", defaultPublicDir(), "the directory to serve static files")
        flags.BoolVar(&indexFallback, "indexFallback", true, "fallback the request to index.html on missing static path (eg. when pretty urls are used with SPA)")
        flags.IntVar(&queryTimeout, "queryTimeout", 30, "the default SELECT queries timeout in seconds")

        // Parse early so the plugin configs below see the user supplied values.
        app.RootCmd.ParseFlags(os.Args[1:])

        jsCfg := jsvm.Config{
            MigrationsDir: migrationsDir,
            HooksDir:      hooksDir,
            HooksWatch:    hooksWatch,
            HooksPoolSize: hooksPool,
        }
        jsvm.MustRegister(app, jsCfg)

        migrateCfg := migratecmd.Config{
            TemplateLang: migratecmd.TemplateLangJS,
            Automigrate:  automigrate,
            Dir:          migrationsDir,
        }
        migratecmd.MustRegister(app, app.RootCmd, migrateCfg)

        ghupdate.MustRegister(app, app.RootCmd, ghupdate.Config{})

        // Apply the configured query timeout once the Dao is available.
        applyQueryTimeout := func(_ *core.BootstrapEvent) error {
            app.Dao().ModelQueryTimeout = time.Duration(queryTimeout) * time.Second
            return nil
        }
        app.OnAfterBootstrap().PreAdd(applyQueryTimeout)

        // Serve static files from publicDir for every GET path.
        serveStatic := func(ev *core.ServeEvent) error {
            ev.Router.GET("/*", apis.StaticDirectoryHandler(os.DirFS(publicDir), indexFallback))
            return nil
        }
        app.OnBeforeServe().Add(serveStatic)
    }

    main.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    package main

    import (
    "log"
    "pocketbase-goodreads/config"

    "github.com/pocketbase/pocketbase"
    )

    // main creates the PocketBase app, applies the shared configuration and
    // starts it; a start error terminates the process via log.Fatal.
    func main() {
        app := pocketbase.New()
        config.InitApp(app)

        err := app.Start()
        if err != nil {
            log.Fatal(err)
        }
    }

  • 安装依赖:go mod tidy

  • 运行 PocketBase:go run . serve

  • 打开网页注册登录超级用户:http://127.0.0.1:8090/_/

  • 创建 quotes 集合(Collection),包含字段:

    字段名称 字段类型 其他 说明
    Content Text Nonempty, Unique index 摘录内容
    Tags Text 摘录标签
    Author Text 作者
    Like Number 摘录被喜欢数量
    在PocketBase后台创建Goodreads摘录集合
    在PocketBase后台创建Goodreads摘录集合
  • 继续添加完善代码:

    repository/goodreads_qoute.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    package repository

    import (
    "log"
    "strconv"
    "strings"

    "github.com/pocketbase/dbx"
    "github.com/pocketbase/pocketbase"
    "github.com/pocketbase/pocketbase/forms"
    "github.com/pocketbase/pocketbase/models"
    )

    // GoodReadsQuoteCollectionName is the name of the PocketBase collection that
    // the functions in this package read quote records from and write them to.
    var GoodReadsQuoteCollectionName = "quotes"

    // CreateNewGoodReadsQuote stores data as a new record in the quotes collection,
    // going through a record upsert form so the collection's validation applies.
    //
    // It returns the created record, or nil and the error when the collection
    // lookup, the data loading or the form submit fails. Lookup and submit
    // failures are written to both the standard logger and the app logger, except
    // submit errors containing "must be unique", which are returned without
    // logging (presumably duplicates from re-crawled pages).
    func CreateNewGoodReadsQuote(app *pocketbase.PocketBase, data map[string]any) (*models.Record, error) {
        collection, err := app.Dao().FindCollectionByNameOrId(GoodReadsQuoteCollectionName)
        if err != nil {
            // log.Println, not log.Fatalln: Fatalln calls os.Exit(1), which would
            // stop the whole PocketBase process and skip the app logger and return.
            log.Println(
                "Not Found of Collection: "+GoodReadsQuoteCollectionName,
                "status", 404,
                "error", err,
            )
            app.Logger().Error(
                "Not Found of Collection: "+GoodReadsQuoteCollectionName,
                "status", 404,
                "error", err,
            )
            return nil, err
        }

        record := models.NewRecord(collection)
        form := forms.NewRecordUpsert(app, record)
        if err := form.LoadData(data); err != nil {
            return nil, err
        }

        if err := form.Submit(); err != nil {
            if !strings.Contains(err.Error(), "must be unique") {
                // NOTE(review): no "code" value is set by the visible callers, so
                // this presumably always logs 0 — confirm whether the field exists.
                code := strconv.Itoa(record.GetInt("code"))
                // A single rejected record must not terminate the process, so
                // this is logged with Println instead of Fatalln.
                log.Println(
                    "Failed to create Quotes record, quote code: "+code,
                    "status", 400,
                    "error", err,
                    "data", data,
                )
                app.Logger().Error(
                    "Failed to create Quotes record, quote code: "+code,
                    "status", 400,
                    "error", err,
                    "data", data,
                )
            }
            return nil, err
        }

        return record, nil
    }

    // GetGoodReadsQuoteById looks up a single record in the quotes collection by
    // its id. It returns nil and the lookup error when the lookup fails.
    func GetGoodReadsQuoteById(app *pocketbase.PocketBase, id string) (*models.Record, error) {
        quote, findErr := app.Dao().FindRecordById(GoodReadsQuoteCollectionName, id)
        if findErr == nil {
            return quote, nil
        }

        return nil, findErr
    }

    // GetGoodReadsQuotesList queries the quotes collection, sorted by "-created",
    // with a limit of 10 and an offset of 0.
    //
    // NOTE(review): the filter binds the literal string "id" to {:id}, so only a
    // record whose id equals "id" can match — this looks like a placeholder;
    // confirm the intended filter.
    func GetGoodReadsQuotesList(app *pocketbase.PocketBase) ([]*models.Record, error) {
        const (
            filter = "id = {:id}"
            sort   = "-created"
            limit  = 10
            offset = 0
        )

        quotes, findErr := app.Dao().FindRecordsByFilter(
            GoodReadsQuoteCollectionName,
            filter,
            sort,
            limit,
            offset,
            dbx.Params{"id": "id"},
        )
        if findErr != nil {
            return nil, findErr
        }

        return quotes, nil
    }

    // UpdateGoodReadsQuote updates the quote record identified by data["id"] with
    // the remaining values in data, going through a record upsert form.
    //
    // It is a no-op returning nil when data["id"] is empty, missing or not a
    // string. Otherwise it returns the error of the record lookup, the data
    // loading or the form submit, or nil on success.
    func UpdateGoodReadsQuote(app *pocketbase.PocketBase, data map[string]any) error {
        // Comma-ok assertion: a missing or non-string id must not panic.
        id, ok := data["id"].(string)
        if !ok || id == "" {
            return nil
        }

        record, err := app.Dao().FindRecordById(GoodReadsQuoteCollectionName, id)
        if err != nil {
            return err
        }

        form := forms.NewRecordUpsert(app, record)
        if err := form.LoadData(data); err != nil {
            return err
        }

        return form.Submit()
    }

    // DeleteGoodReadsQuote removes the quote record with the given id from the
    // quotes collection. It returns the lookup error when the record cannot be
    // found, otherwise the result of the delete.
    func DeleteGoodReadsQuote(app *pocketbase.PocketBase, id string) error {
        dao := app.Dao()

        quote, findErr := dao.FindRecordById(GoodReadsQuoteCollectionName, id)
        if findErr != nil {
            return findErr
        }

        return dao.DeleteRecord(quote)
    }

    crawlers/goodreads_quote.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    package crawlers

    import (
    "log"
    "pocketbase-goodreads/repository"
    "regexp"
    "strconv"
    "strings"

    "github.com/PuerkitoBio/goquery"
    "github.com/gocolly/colly/v2"
    "github.com/pocketbase/pocketbase"
    )

    // quoteSymbolsReg matches the left and right curly double quotation marks;
    // the crawler uses it to strip them from the quote text.
    var quoteSymbolsReg = regexp.MustCompile(`[“”]`)
    // whiteSpaceReg matches one or more consecutive whitespace characters; the
    // crawler uses it to remove all whitespace from the tags text.
    var whiteSpaceReg = regexp.MustCompile(`\s+`)

    // GoodReadsQuoteCrawler registers the quote page callbacks on c and then
    // visits link.
    //
    // The callbacks follow further quote pages (except "?page=1" links, to avoid
    // crawling the first page twice), log every finished or failed request, and
    // store each quote found under ".leftContainer" through
    // repository.CreateNewGoodReadsQuote.
    //
    // NOTE(review): every call adds another set of callbacks to c, so calling
    // this repeatedly with the same collector processes each page several times.
    // Consider registering the callbacks once per collector.
    func GoodReadsQuoteCrawler(app *pocketbase.PocketBase, c *colly.Collector, link string) {
        // Follow pagination links of the quote list.
        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
            href := e.Attr("href")
            nextLink := e.Request.AbsoluteURL(href)
            // prevent duplicated crawl for the 1st quote page
            if QuoteLinkRegexp.MatchString(nextLink) && !FirstQuoteLinkPageRegexp.MatchString(nextLink) {
                e.Request.Visit(href)
            }
        })

        c.OnScraped(func(r *colly.Response) {
            url := r.Request.URL.String()
            log.Println(
                "Requested: "+url,
                "status", r.StatusCode,
                "model", "colly:good-reads:quote",
            )
            app.Logger().Info(
                "Requested: "+url,
                "status", r.StatusCode,
                "model", "colly:good-reads:quote",
            )
        })

        c.OnError(func(r *colly.Response, err error) {
            url := r.Request.URL.String()
            log.Println(
                "Failed to request: "+url,
                "error", err,
                "status", r.StatusCode,
                "model", "colly:good-reads:quote",
            )
            app.Logger().Error(
                "Failed to request: "+url,
                "error", err,
                "status", r.StatusCode,
                "model", "colly:good-reads:quote",
            )
        })

        c.OnHTML(".leftContainer", func(h *colly.HTMLElement) {
            h.DOM.Find(`.quotes .quote .quoteDetails`).Each(func(i int, s *goquery.Selection) {
                // Read every field from the current quote (s). Searching h.DOM
                // here would concatenate the text of all quotes on the page.
                rawText := s.Find(".quoteText").Text()
                // The text before the "―" separator is the quote itself.
                content := quoteSymbolsReg.ReplaceAllString(strings.TrimSpace(strings.Split(rawText, "―")[0]), "")

                rawTags := strings.Replace(s.Find(".quoteFooter .left").Text(), "tags:", "", -1)
                tags := whiteSpaceReg.ReplaceAllString(strings.TrimSpace(rawTags), "")

                author := strings.TrimSpace(strings.Replace(s.Find("span.authorOrTitle").Text(), ",", "", -1))

                rawLikes := strings.Replace(s.Find(".quoteFooter .right a").Text(), "likes", "", -1)
                like, err := strconv.Atoi(strings.TrimSpace(rawLikes))
                if err != nil {
                    // A missing or unparsable like counter is stored as 0.
                    like = 0
                }

                q := map[string]any{
                    "Content": content,
                    "Tags":    tags,
                    "Like":    like,
                    "Author":  author,
                }

                // Failures are logged inside the repository function, so the
                // returned error is intentionally not handled again here.
                repository.CreateNewGoodReadsQuote(app, q)
            })
        })

        // Start the request only after all callbacks are in place; visiting first
        // would let the response be processed before the handlers exist.
        if err := c.Visit(link); err != nil {
            app.Logger().Warn(
                "Skipped visit: "+link,
                "error", err,
                "model", "colly:good-reads:quote",
            )
        }
    }

    crawlers/goodreads.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    package crawlers

    import (
    "fmt"
    "regexp"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/extensions"
    "github.com/pocketbase/pocketbase"
    "github.com/pocketbase/pocketbase/core"
    )

    // URL patterns for the Goodreads pages handled by the crawlers. They are
    // passed to colly.URLFilters and used to tell quote pages apart from others.

    // GenresLinkRegexp matches URLs under https://www.goodreads.com/genres/.
    var GenresLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/genres/.+$`)
    // ShelfLinkRegexp matches URLs under https://www.goodreads.com/shelf/.
    var ShelfLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/shelf/.+$`)
    // AuthorLinkRegexp matches author pages of the form /author/show/<digits>.<name>.
    var AuthorLinkRegexp = regexp.MustCompile(`(?m)https:\/\/www\.goodreads\.com\/author\/show\/\d+\.[[:alnum:]\-\._]+$`)
    // BookLinkRegexp matches book pages of the form /book/show/<book-code>.
    var BookLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/book/show/[[:alnum:]\-\._]+$`)
    // QuoteLinkRegexp matches URLs under https://www.goodreads.com/work/quotes/.
    var QuoteLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/work/quotes/.+$`)
    // AuthorAllBooksLinkRegexp matches URLs under https://www.goodreads.com/author/list/.
    var AuthorAllBooksLinkRegexp = regexp.MustCompile(`https:\/\/www\.goodreads\.com\/author\/list\/.+$`)

    // FirstQuoteLinkPageRegexp matches a quote list URL that ends in "?page=1".
    // The crawlers skip such links to avoid crawling the first page twice.
    var FirstQuoteLinkPageRegexp = regexp.MustCompile(`https://www\.goodreads\.com/work/quotes/[[:alnum:]\-\._]+\?page\=1$`)

    // InitGoodReadsCrawler hooks the Goodreads crawler into app: once PocketBase
    // has bootstrapped, GoodReadsCrawler is launched in its own goroutine so the
    // hook itself returns immediately.
    func InitGoodReadsCrawler(app *pocketbase.PocketBase) {
        startCrawler := func(_ *core.BootstrapEvent) error {
            go GoodReadsCrawler(app)
            return nil
        }

        app.OnAfterBootstrap().Add(startCrawler)
    }

    // GoodReadsCrawler crawls Goodreads starting from the "all books" pages of
    // the authors with codes 3 through 20 and blocks until the crawl is finished.
    //
    // Links matching QuoteLinkRegexp are handed to GoodReadsQuoteCrawler on a
    // cloned collector; every other link is followed by the main collector,
    // subject to the URL filters. Finished and failed requests are written to
    // the app logger.
    func GoodReadsCrawler(app *pocketbase.PocketBase) {
        c := colly.NewCollector(
            colly.Async(),
            colly.AllowedDomains("www.goodreads.com"),
            colly.URLFilters(
                BookLinkRegexp,
                GenresLinkRegexp,
                ShelfLinkRegexp,
                QuoteLinkRegexp,
                AuthorLinkRegexp,
                AuthorAllBooksLinkRegexp,
            ),
        )

        c.SetRequestTimeout(10 * time.Second)

        extensions.RandomUserAgent(c)

        // Separate collector for quote pages so their callbacks stay apart from
        // the link-discovery callbacks below.
        quoteCollector := c.Clone()

        c.OnHTML("a[href]", func(e *colly.HTMLElement) {
            link := e.Attr("href")
            nextLink := e.Request.AbsoluteURL(link)

            if !QuoteLinkRegexp.MatchString(nextLink) {
                e.Request.Visit(link)
                return
            }

            // prevent duplicated crawl for the 1st quote page
            if !FirstQuoteLinkPageRegexp.MatchString(nextLink) {
                GoodReadsQuoteCrawler(app, quoteCollector, nextLink)
            }
        })

        c.OnScraped(func(r *colly.Response) {
            app.Logger().Info(
                "Requested: "+r.Request.URL.String(),
                "status", r.StatusCode,
                "model", "colly:good-reads",
            )
        })

        c.OnError(func(r *colly.Response, err error) {
            app.Logger().Error(
                "Failed to request: "+r.Request.URL.String(),
                "error", err,
                "status", r.StatusCode,
                "model", "colly:good-reads",
            )
        })

        // Set max Parallelism and introduce a Random Delay.
        // DomainGlob is matched against the request host, not the full URL, so it
        // must be a host pattern; a "https://…/*" pattern never matches and would
        // leave the crawl without any rate limit.
        limitErr := c.Limit(&colly.LimitRule{
            Parallelism: 1,
            RandomDelay: 10 * time.Second,
            DomainGlob:  "www.goodreads.com",
        })
        if limitErr != nil {
            app.Logger().Error(
                "Failed to set crawl limit",
                "error", limitErr,
                "model", "colly:good-reads",
            )
        }

        // Only crawl the authors whose code lies between 3 and 20 (inclusive) as
        // an example; codes without an author simply fail the request.
        const firstAuthorCode, lastAuthorCode = 3, 20
        for code := firstAuthorCode; code <= lastAuthorCode; code++ {
            c.Visit(fmt.Sprintf("https://www.goodreads.com/author/list/%d", code))
        }

        // Both collectors are asynchronous; wait for the main crawl first, then
        // for the quote requests it queued.
        c.Wait()
        quoteCollector.Wait()
    }

    main.go

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    package main

    import (
    "log"
    "pocketbase-goodreads/config"

    "pocketbase-goodreads/crawlers"

    "github.com/pocketbase/pocketbase"
    )

    // main creates the PocketBase app, applies the shared configuration,
    // registers the Goodreads crawler and starts the app; a start error
    // terminates the process via log.Fatal.
    func main() {
        app := pocketbase.New()
        config.InitApp(app)

        // Register the Goodreads crawler service.
        crawlers.InitGoodReadsCrawler(app)

        err := app.Start()
        if err != nil {
            log.Fatal(err)
        }
    }
  • 重启 PocketBase:go run . serve

    Goodreads爬虫运行结果
    Goodreads爬虫运行结果

最后

存在问题:

  • 使用 Goroutine 将 Goodreads 爬虫服务挂载在 PocketBase 的 OnAfterBootstrap 钩子上,PocketBase 和爬虫同时运行时导致后台操作阻塞。

考虑到的解决方案:

不确定是否是 Goroutine 使用不正确的原因,继续使用协程需寻求 Go 层级上的解决方案。
「按需爬取」:爬虫服务不再挂载到 PocketBase 上,也不再跟随 PocketBase 服务实时启动,将提供 API 方式获取爬取指令后再「按需爬取」。如:按书籍作者则爬取书籍作者下所有书籍的摘录、按摘录分类爬取、按单本书籍爬取等。