前言
由于 PocketBase 自带了一个简洁、美观的管理后台,且加上其单文件运行、支持 S3 备份等特性,集成爬虫服务后,非常方便查询管理入库后的数据。前两篇博客(PocketBase:极简全能后端、简单扩展 PocketBase 功能)已经在「理论」上简单介绍了 PocketBase,这篇博客则进行简单的实操。
本次的目标是在 PocketBase 内集成爬虫服务,爬取 GoodReads(类似豆瓣读书)的摘录入库。
Goodreads 相关摘录页分析
Goodreads 摘录按两种类型聚合:基于书籍、基于摘录分类。
graph TD
    书籍文体分类 --> 书籍 --> 摘录
    书籍作者 --> 书籍 --> 摘录
    摘录分类 --> 摘录
两种聚合类型最后展示「摘录」的 HTML 结构完全一致:
相关链接:
- 摘录列表,可按分类筛选:
https://www.goodreads.com/quotes?page=1
https://www.goodreads.com/quotes/tag/<tag-name>?page=1
- 书籍文体分类列表:
https://www.goodreads.com/genres/list?page=1
- 作者下的所有书籍列表:
https://www.goodreads.com/author/list/<author-code>
- 书籍介绍:
https://www.goodreads.com/book/show/<book-code>
- 书籍下的摘录列表:
https://www.goodreads.com/work/quotes/<book-quote-code>
- 摘录列表,可按分类筛选:
本次采用:书籍作者 → 书籍 → 摘录 这个链路操作。
「书籍作者」的编码可以从 3 遍历到 20 作为示例。
目录结构
1 | . |
代码及操作
初始化项目:
go mod init pocketbase-goodreads
添加代码:
config/init.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116package config
import (
"os"
"path/filepath"
"strings"
"time"
"github.com/pocketbase/pocketbase"
"github.com/pocketbase/pocketbase/apis"
"github.com/pocketbase/pocketbase/core"
"github.com/pocketbase/pocketbase/plugins/ghupdate"
"github.com/pocketbase/pocketbase/plugins/jsvm"
"github.com/pocketbase/pocketbase/plugins/migratecmd"
)
// defaultPublicDir picks the default directory for static files.
//
// It returns "./pb_public" when the executable path (os.Args[0]) starts with
// os.TempDir() - presumably a temporary `go run` build - and otherwise the
// "pb_public" directory that sits next to the executable.
func defaultPublicDir() string {
	exe := os.Args[0]
	if !strings.HasPrefix(exe, os.TempDir()) {
		return filepath.Join(exe, "../pb_public")
	}
	return "./pb_public"
}
// InitApp wires the optional command-line flags and plugins into app.
//
// It registers the persistent flags (hooksDir, hooksWatch, hooksPool,
// migrationsDir, automigrate, publicDir, indexFallback, queryTimeout), parses
// os.Args[1:] right away so their values are available, registers the jsvm,
// migratecmd and ghupdate plugins, applies the query timeout after bootstrap
// and serves publicDir on "/*" before the server starts.
func InitApp(app *pocketbase.PocketBase) {
	var (
		hooksDir      string
		hooksWatch    bool
		hooksPool     int
		migrationsDir string
		automigrate   bool
		publicDir     string
		indexFallback bool
		queryTimeout  int
	)

	flags := app.RootCmd.PersistentFlags()
	flags.StringVar(&hooksDir, "hooksDir", "", "the directory with the JS app hooks")
	flags.BoolVar(&hooksWatch, "hooksWatch", true, "auto restart the app on pb_hooks file change")
	flags.IntVar(&hooksPool, "hooksPool", 25, "the total prewarm goja.Runtime instances for the JS app hooks execution")
	flags.StringVar(&migrationsDir, "migrationsDir", "", "the directory with the user defined migrations")
	flags.BoolVar(&automigrate, "automigrate", true, "enable/disable auto migrations")
	flags.StringVar(&publicDir, "publicDir", defaultPublicDir(), "the directory to serve static files")
	flags.BoolVar(&indexFallback, "indexFallback", true, "fallback the request to index.html on missing static path (eg. when pretty urls are used with SPA)")
	flags.IntVar(&queryTimeout, "queryTimeout", 30, "the default SELECT queries timeout in seconds")

	// Parse before registering the plugins: their configs read the flag values.
	app.RootCmd.ParseFlags(os.Args[1:])

	jsvmConfig := jsvm.Config{
		MigrationsDir: migrationsDir,
		HooksDir:      hooksDir,
		HooksWatch:    hooksWatch,
		HooksPoolSize: hooksPool,
	}
	jsvm.MustRegister(app, jsvmConfig)

	migrateConfig := migratecmd.Config{
		TemplateLang: migratecmd.TemplateLangJS,
		Automigrate:  automigrate,
		Dir:          migrationsDir,
	}
	migratecmd.MustRegister(app, app.RootCmd, migrateConfig)

	ghupdate.MustRegister(app, app.RootCmd, ghupdate.Config{})

	// Apply the SELECT timeout once the app has bootstrapped.
	app.OnAfterBootstrap().PreAdd(func(e *core.BootstrapEvent) error {
		timeout := time.Duration(queryTimeout) * time.Second
		app.Dao().ModelQueryTimeout = timeout
		return nil
	})

	// Serve the static files from publicDir on every otherwise unmatched GET path.
	app.OnBeforeServe().Add(func(e *core.ServeEvent) error {
		staticHandler := apis.StaticDirectoryHandler(os.DirFS(publicDir), indexFallback)
		e.Router.GET("/*", staticHandler)
		return nil
	})
}main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19package main
import (
"log"
"pocketbase-goodreads/config"
"github.com/pocketbase/pocketbase"
)
// main creates the PocketBase app, applies the shared configuration from the
// config package and starts it; a Start error terminates via log.Fatal.
func main() {
	app := pocketbase.New()
	config.InitApp(app)

	err := app.Start()
	if err != nil {
		log.Fatal(err)
	}
}安装依赖:
go mod tidy
运行 PocketBase:
go run . serve
打开网页注册登录超级用户:
http://127.0.0.1:8090/_/
创建 quotes 集合(Collection),包含字段:

| 字段名称 | 字段类型 | 其他 | 说明 |
| --- | --- | --- | --- |
| Content | Text | Nonempty, Unique index | 摘录内容 |
| Tags | Text | | 摘录标签 |
| Author | Text | | 作者 |
| Like | Number | | 摘录被喜欢数量 |

继续添加完善代码:
repository/goodreads_qoute.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113package repository
import (
"log"
"strconv"
"strings"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase"
"github.com/pocketbase/pocketbase/forms"
"github.com/pocketbase/pocketbase/models"
)
// GoodReadsQuoteCollectionName is the collection name passed to every Dao
// lookup made by the functions in this package.
var GoodReadsQuoteCollectionName = "quotes"
// CreateNewGoodReadsQuote creates a new record in the quotes collection from
// data (field name -> value) and returns the stored record.
//
// It returns a nil record and the error when the collection cannot be found
// or the record upsert form fails. Failures are logged both to the standard
// logger and to the app logger, except submit errors containing
// "must be unique", which are returned without logging.
func CreateNewGoodReadsQuote(app *pocketbase.PocketBase, data map[string]any) (*models.Record, error) {
	collection, err := app.Dao().FindCollectionByNameOrId(GoodReadsQuoteCollectionName)
	if err != nil {
		// log.Println, not log.Fatalln: Fatalln exits the whole process, which
		// would take the server down and make the lines below unreachable.
		log.Println(
			"Not Found of Collection: "+GoodReadsQuoteCollectionName,
			"status", 404,
			"error", err,
		)
		app.Logger().Error(
			"Not Found of Collection: "+GoodReadsQuoteCollectionName,
			"status", 404,
			"error", err,
		)
		return nil, err
	}

	record := models.NewRecord(collection)
	form := forms.NewRecordUpsert(app, record)
	form.LoadData(data)

	if err := form.Submit(); err != nil {
		// Unique-constraint violations are not logged.
		if !strings.Contains(err.Error(), "must be unique") {
			// NOTE(review): reads a "code" field from the record; confirm the
			// collection actually defines it, otherwise this is always 0.
			code := strconv.Itoa(record.GetInt("code"))
			log.Println(
				"Failed to create Quotes record, quote code: "+code,
				"status", 400,
				"error", err,
				"data", data,
			)
			app.Logger().Error(
				"Failed to create Quotes record, quote code: "+code,
				"status", 400,
				"error", err,
				"data", data,
			)
		}
		return nil, err
	}

	return record, nil
}
// GetGoodReadsQuoteById looks up a single record of the quotes collection by
// its id. On lookup failure it returns a nil record together with the error.
func GetGoodReadsQuoteById(app *pocketbase.PocketBase, id string) (*models.Record, error) {
	dao := app.Dao()

	quote, findErr := dao.FindRecordById(GoodReadsQuoteCollectionName, id)
	if findErr != nil {
		return nil, findErr
	}

	return quote, nil
}
// GetGoodReadsQuotesList returns up to 10 records of the quotes collection,
// sorted by "-created", starting at offset 0.
//
// The filter matches every record with a non-empty id. The previous filter
// compared id with the literal string "id" and therefore never matched a
// real record.
func GetGoodReadsQuotesList(app *pocketbase.PocketBase) ([]*models.Record, error) {
	records, err := app.Dao().FindRecordsByFilter(
		GoodReadsQuoteCollectionName,
		"id != {:empty}",
		"-created",
		10,
		0,
		dbx.Params{"empty": ""},
	)
	if err != nil {
		return nil, err
	}

	return records, nil
}
// UpdateGoodReadsQuote updates the quotes record identified by data["id"]
// with the remaining values in data.
//
// It is a no-op returning nil when data carries no usable id (missing key,
// non-string value or empty string). Otherwise it returns the error of the
// record lookup or of the upsert form submit, or nil on success.
func UpdateGoodReadsQuote(app *pocketbase.PocketBase, data map[string]any) error {
	// Comma-ok assertion: a missing or non-string "id" yields "" instead of
	// panicking, and then takes the same no-op path as an empty id.
	id, _ := data["id"].(string)
	if id == "" {
		return nil
	}

	record, err := app.Dao().FindRecordById(GoodReadsQuoteCollectionName, id)
	if err != nil {
		return err
	}

	form := forms.NewRecordUpsert(app, record)
	form.LoadData(data)

	return form.Submit()
}
// DeleteGoodReadsQuote removes the quotes record with the given id. It
// returns the lookup error when the record cannot be found, otherwise the
// result of the delete call.
func DeleteGoodReadsQuote(app *pocketbase.PocketBase, id string) error {
	target, findErr := app.Dao().FindRecordById(GoodReadsQuoteCollectionName, id)
	if findErr != nil {
		return findErr
	}

	return app.Dao().DeleteRecord(target)
}crawlers/goodreads_quote.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82package crawlers
import (
"log"
"pocketbase-goodreads/repository"
"regexp"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/pocketbase/pocketbase"
)
// quoteSymbolsReg matches the curly double quotation marks “ and ”; the quote
// crawler uses it to strip them from the quote content.
var quoteSymbolsReg = regexp.MustCompile(`[“”]`)
// whiteSpaceReg matches runs of one or more whitespace characters; the quote
// crawler uses it to remove all whitespace from the tags text.
var whiteSpaceReg = regexp.MustCompile(`\s+`)
// GoodReadsQuoteCrawler registers the quote-page callbacks on c and then
// queues link for crawling.
//
// For every ".quotes .quote .quoteDetails" element found under
// ".leftContainer" it extracts the content, tags, author and like count of
// that single quote and stores them through
// repository.CreateNewGoodReadsQuote. Links matching QuoteLinkRegexp (except
// those matching FirstQuoteLinkPageRegexp) are followed on the same request.
//
// NOTE(review): colly appears to append a callback on every
// OnHTML/OnScraped/OnError call, so calling this function repeatedly with the
// same collector presumably stacks duplicate handlers - confirm, and consider
// registering the callbacks once per collector.
func GoodReadsQuoteCrawler(app *pocketbase.PocketBase, c *colly.Collector, link string) {
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		nextLink := e.Request.AbsoluteURL(href)
		if !QuoteLinkRegexp.MatchString(nextLink) {
			return
		}
		// prevent duplicated crawl for the 1st quote page
		if FirstQuoteLinkPageRegexp.MatchString(nextLink) {
			return
		}
		// Best effort: the Visit error is intentionally not handled here.
		e.Request.Visit(href)
	})

	c.OnScraped(func(r *colly.Response) {
		url := r.Request.URL.String()
		log.Println(
			"Requested: "+url,
			"status", r.StatusCode,
			"model", "colly:good-reads:quote",
		)
		app.Logger().Info(
			"Requested: "+url,
			"status", r.StatusCode,
			"model", "colly:good-reads:quote",
		)
	})

	c.OnError(func(r *colly.Response, err error) {
		url := r.Request.URL.String()
		log.Println(
			"Failed to request: "+url,
			"error", err,
			"status", r.StatusCode,
			"model", "colly:good-reads:quote",
		)
		app.Logger().Error(
			"Failed to request: "+url,
			"error", err,
			"status", r.StatusCode,
			"model", "colly:good-reads:quote",
		)
	})

	c.OnHTML(".leftContainer", func(h *colly.HTMLElement) {
		h.DOM.Find(`.quotes .quote .quoteDetails`).Each(func(i int, s *goquery.Selection) {
			// Read every field from s (the current quote). Searching h.DOM
			// instead would concatenate the text of all quotes on the page and
			// give every record the same content, tags and author.
			quoteText := strings.Split(s.Find(".quoteText").Text(), "―")[0]
			content := quoteSymbolsReg.ReplaceAllString(strings.TrimSpace(quoteText), "")

			tagsText := strings.Replace(s.Find(".quoteFooter .left").Text(), "tags:", "", -1)
			tags := whiteSpaceReg.ReplaceAllString(strings.TrimSpace(tagsText), "")

			author := strings.TrimSpace(strings.Replace(s.Find("span.authorOrTitle").Text(), ",", "", -1))

			likeText := strings.Replace(s.Find(".quoteFooter .right a").Text(), "likes", "", -1)
			like, err := strconv.Atoi(strings.TrimSpace(likeText))
			if err != nil {
				// Unparsable like counts are stored as 0.
				like = 0
			}

			q := map[string]any{
				"Content": content,
				"Tags":    tags,
				"Like":    like,
				"Author":  author,
			}
			// The result is not checked here; CreateNewGoodReadsQuote does its
			// own logging for failures other than "must be unique".
			repository.CreateNewGoodReadsQuote(app, q)
		})
	})

	// Queue the page only after all callbacks are registered, so an async
	// response cannot arrive before its handlers exist.
	if err := c.Visit(link); err != nil {
		app.Logger().Debug(
			"Skipped: "+link,
			"error", err,
			"model", "colly:good-reads:quote",
		)
	}
}crawlers/goodreads.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98package crawlers
import (
"fmt"
"regexp"
"time"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/extensions"
"github.com/pocketbase/pocketbase"
"github.com/pocketbase/pocketbase/core"
)
// GenresLinkRegexp matches Goodreads URLs under /genres/.
var GenresLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/genres/.+$`)
// ShelfLinkRegexp matches Goodreads URLs under /shelf/.
var ShelfLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/shelf/.+$`)
// AuthorLinkRegexp matches author pages of the form /author/show/<digits>.<slug>
// (multi-line mode, so $ also matches at line ends).
var AuthorLinkRegexp = regexp.MustCompile(`(?m)https:\/\/www\.goodreads\.com\/author\/show\/\d+\.[[:alnum:]\-\._]+$`)
// BookLinkRegexp matches book pages of the form /book/show/<code>.
var BookLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/book/show/[[:alnum:]\-\._]+$`)
// QuoteLinkRegexp matches Goodreads URLs under /work/quotes/.
var QuoteLinkRegexp = regexp.MustCompile(`https://www\.goodreads\.com/work/quotes/.+$`)
// AuthorAllBooksLinkRegexp matches Goodreads URLs under /author/list/.
var AuthorAllBooksLinkRegexp = regexp.MustCompile(`https:\/\/www\.goodreads\.com\/author\/list\/.+$`)
// FirstQuoteLinkPageRegexp matches /work/quotes/<code> URLs that end in
// "?page=1"; the crawlers use it to skip explicit first-page links.
var FirstQuoteLinkPageRegexp = regexp.MustCompile(`https://www\.goodreads\.com/work/quotes/[[:alnum:]\-\._]+\?page\=1$`)
// InitGoodReadsCrawler attaches the Goodreads crawler to PocketBase's
// OnAfterBootstrap hook. The crawl is launched in its own goroutine, so the
// hook itself returns nil immediately.
func InitGoodReadsCrawler(app *pocketbase.PocketBase) {
	startCrawler := func(e *core.BootstrapEvent) error {
		go GoodReadsCrawler(app)
		return nil
	}

	app.OnAfterBootstrap().Add(startCrawler)
}
// GoodReadsCrawler crawls Goodreads starting from the book lists of the
// authors with codes 3 through 20 and blocks until both collectors are idle.
//
// Links matching QuoteLinkRegexp (except explicit "?page=1" links) are handed
// to GoodReadsQuoteCrawler on a cloned collector; every other link is
// followed on the main collector, subject to the AllowedDomains and
// URLFilters settings. Request results are written to the app logger.
func GoodReadsCrawler(app *pocketbase.PocketBase) {
	c := colly.NewCollector(
		colly.Async(),
		colly.AllowedDomains("www.goodreads.com"),
		colly.URLFilters(
			BookLinkRegexp,
			GenresLinkRegexp,
			ShelfLinkRegexp,
			QuoteLinkRegexp,
			AuthorLinkRegexp,
			AuthorAllBooksLinkRegexp,
		),
	)
	c.SetRequestTimeout(10 * time.Second)
	extensions.RandomUserAgent(c)

	quoteCollector := c.Clone()

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		nextLink := e.Request.AbsoluteURL(link)
		if !QuoteLinkRegexp.MatchString(nextLink) {
			// Best effort: the Visit error is intentionally not handled here.
			e.Request.Visit(link)
			return
		}
		// prevent duplicated crawl for the 1st quote page
		if !FirstQuoteLinkPageRegexp.MatchString(nextLink) {
			GoodReadsQuoteCrawler(app, quoteCollector, nextLink)
		}
	})

	c.OnScraped(func(r *colly.Response) {
		app.Logger().Info(
			"Requested: "+r.Request.URL.String(),
			"status", r.StatusCode,
			"model", "colly:good-reads",
		)
	})

	c.OnError(func(r *colly.Response, err error) {
		app.Logger().Error(
			"Failed to request: "+r.Request.URL.String(),
			"error", err,
			"status", r.StatusCode,
			"model", "colly:good-reads",
		)
	})

	// Set max Parallelism and introduce a Random Delay.
	// DomainGlob is matched against the request host, so it must be a host
	// pattern; a full URL pattern such as "https://www.goodreads.com/*" never
	// matches and leaves the crawl unthrottled.
	limitErr := c.Limit(&colly.LimitRule{
		Parallelism: 1,
		RandomDelay: 10 * time.Second,
		DomainGlob:  "www.goodreads.com",
	})
	if limitErr != nil {
		app.Logger().Error(
			"Failed to set limit rule",
			"error", limitErr,
			"model", "colly:good-reads",
		)
	}

	// Only crawl the authors whose code is between 3 and 20 (inclusive) as an
	// example.
	for code := 3; code <= 20; code++ {
		target := fmt.Sprintf("https://www.goodreads.com/author/list/%d", code)
		if err := c.Visit(target); err != nil {
			app.Logger().Error(
				"Failed to queue: "+target,
				"error", err,
				"model", "colly:good-reads",
			)
		}
	}

	c.Wait()
	// The cloned collector is waited on separately so quote pages queued by
	// the handlers above are finished before returning.
	quoteCollector.Wait()
}main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23package main
import (
"log"
"pocketbase-goodreads/config"
"pocketbase-goodreads/crawlers"
"github.com/pocketbase/pocketbase"
)
// main creates the PocketBase app, applies the shared configuration,
// registers the Goodreads crawler and starts the app; a Start error
// terminates via log.Fatal.
func main() {
	app := pocketbase.New()
	config.InitApp(app)

	// Register the Goodreads crawler service.
	crawlers.InitGoodReadsCrawler(app)

	err := app.Start()
	if err != nil {
		log.Fatal(err)
	}
}重启 PocketBase:
go run . serve
最后
存在问题:
- 使用 Goroutine 将 Goodreads 爬虫服务挂载在 PocketBase 的 OnAfterBootstrap 钩子上,PocketBase 和爬虫同时运行时导致后台操作阻塞。
考虑到的解决方案:
不确定是否是 Goroutine 使用不正确的原因,继续使用协程需寻求 Go 层级上的解决方案。
「按需爬取」:爬虫服务不再挂载到 PocketBase 上,也不再跟随 PocketBase 服务实时启动,将提供 API 方式获取爬取指令后再「按需爬取」。如:按书籍作者则爬取书籍作者下所有书籍的摘录、按摘录分类爬取、按单本书籍爬取等。