Getting started
Installation
mkdir myapp && cd $_
go mod init myapp
go get -u github.com/gocolly/colly/v2
Get Started
main.go
package main
import (
"fmt"
"log"
"github.com/gocolly/colly/v2"
)
func main() {
// Instantiate default collector
c := colly.NewCollector()
// Before making a request print "Visiting ..."
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
// Called if error occurred during the request
c.OnError(func(_ *colly.Response, err error) {
log.Println("Something went wrong:", err)
})
// Called after response headers received
c.OnResponseHeaders(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})
// Called after response received
c.OnResponse(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})
// Called right after `OnResponse` if the received content is HTML
// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnHTML("tr td:nth-of-type(1)", func(e *colly.HTMLElement) {
fmt.Println("First column of a table row:", e.Text)
})
// Called right after `OnHTML` if the received content is HTML or XML
c.OnXML("//h1", func(e *colly.XMLElement) {
fmt.Println(e.Text)
})
// Called after `OnXML` callbacks
c.OnScraped(func(r *colly.Response) {
fmt.Println("Finished", r.Request.URL)
})
// Start scraping on https://hackerspaces.org
c.Visit("https://hackerspaces.org/")
}
type Collector
colly.go
type Collector struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// Custom headers for the request
Headers *http.Header
// MaxDepth limits the recursion depth of visited URLs.
// Set it to 0 for infinite recursion (default).
MaxDepth int
// AllowedDomains is a domain whitelist.
// Leave it blank to allow any domains to be visited
AllowedDomains []string
// DisallowedDomains is a domain blacklist.
DisallowedDomains []string
// DisallowedURLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL, the
// request will be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
// Leave it blank to allow any URLs to be visited
DisallowedURLFilters []*regexp.Regexp
// URLFilters is a list of regular expressions which restricts visiting URLs.
// If any of the rules matches a URL, the request won't be stopped.
// Leave it blank to allow any URLs to be visited
URLFilters []*regexp.Regexp
// AllowURLRevisit allows multiple downloads of the same URL
AllowURLRevisit bool
// MaxBodySize is the limit of the retrieved response body in bytes.
// 0 means unlimited.
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file. See http://www.robotstxt.org/ for more
// information.
IgnoreRobotsTxt bool
// Async turns on asynchronous network communication. Use Collector.Wait() to
// be sure all requests have been finished.
Async bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
// to true to enable it.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// CheckHead performs a HEAD request before every GET to pre-validate the response
CheckHead bool
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
// Context is the context that will be used for HTTP requests. You can set this
// to support clean cancellation of scraping.
Context context.Context
// MaxRequests limits the number of requests done by the instance.
// Set it to 0 for infinite requests (default).
MaxRequests uint32
// contains filtered or unexported fields
}
See colly.go
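For orientation, here is a minimal sketch of setting several of the fields above through the matching functional options at construction time (the domain, depth, cache directory and size values are placeholders):
c := colly.NewCollector(
colly.AllowedDomains("httpbin.org"),
colly.MaxDepth(2),
colly.CacheDir("./colly_cache"),
colly.MaxBodySize(5 * 1024 * 1024),
)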
Configuration
Create Collector with Default Settings
main.go
//...
c := colly.NewCollector()
//...
Overwrite Default Settings
main.go
//...
import (
"math/rand"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/extensions"
)
c := colly.NewCollector(
colly.UserAgent("xy"),
colly.AllowURLRevisit(),
)
extensions.RandomUserAgent(c)
extensions.Referer(c)
// or
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
func RandomString() string {
b := make([]byte, rand.Intn(10)+10)
for i := range b {
b[i] = letterBytes[rand.Intn(len(letterBytes))]
}
return string(b)
}
c2 := colly.NewCollector()
c2.AllowURLRevisit = true
// changes User-Agent on every request
c2.OnRequest(func(r *colly.Request) {
r.Headers.Set("User-Agent", RandomString())
})
//...
Configuration can be changed at any point of a scraping job by overwriting the attributes of the collectors.
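For example, a minimal sketch (the URL and depth values are placeholders) of loosening the limits on an existing collector between two visits:
c := colly.NewCollector(colly.MaxDepth(1))
c.Visit("https://example.com/")
// relax the limits for a second pass with the same collector
c.MaxDepth = 3
c.AllowURLRevisit = true
c.Visit("https://example.com/")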
Configuration via Environment Variables
.env
# comma separated list of domains
COLLY_ALLOWED_DOMAINS=
# string
COLLY_CACHE_DIR=
# y/n
COLLY_DETECT_CHARSET=
# y/n
COLLY_DISABLE_COOKIES=
# comma separated list of domains
COLLY_DISALLOWED_DOMAINS=
# y/n
COLLY_IGNORE_ROBOTSTXT=
# y/n
COLLY_FOLLOW_REDIRECTS=
# int
COLLY_MAX_BODY_SIZE=
# int - 0 means infinite
COLLY_MAX_DEPTH=
# y/n
COLLY_PARSE_HTTP_ERROR_RESPONSE=
# string
COLLY_USER_AGENT=
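These variables are parsed while the collector is being initialized, so, as a sketch with placeholder values (and assuming the os package is imported), they can also be set from Go right before calling NewCollector:
os.Setenv("COLLY_MAX_DEPTH", "3")
os.Setenv("COLLY_USER_AGENT", "my-crawler/1.0")
c := colly.NewCollector()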
HTTP Configuration
main.go
// ...
c := colly.NewCollector()
c.WithTransport(&http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
DualStack: true,
}).DialContext,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
})
// ...
Colly uses Go's default http.Client as its networking layer; HTTP options can be tweaked further by replacing the default http.RoundTripper as shown above.
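A per-request timeout is another common tweak; a minimal sketch (the 30-second value is arbitrary) using the collector's SetRequestTimeout helper:
c := colly.NewCollector()
// applies to the whole request, not only the dial phase
c.SetRequestTimeout(30 * time.Second)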
Best Practices
Debugging
main.go
import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
func main() {
c := colly.NewCollector(colly.Debugger(&debug.LogDebugger{}))
// [..]
}
You can create any kind of custom debugger by implementing the debug.Debugger interface. A good example is LogDebugger.
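As a rough sketch of such a custom debugger (assuming the interface consists of the Init() error and Event(*debug.Event) methods that LogDebugger also implements; the output format and the log import are arbitrary choices):
type stdoutDebugger struct{}
// Init sets up the backend; nothing to prepare here
func (d *stdoutDebugger) Init() error { return nil }
// Event receives every collector event and prints it
func (d *stdoutDebugger) Event(e *debug.Event) {
log.Printf("collector %d: %s %v", e.CollectorID, e.Type, e.Values)
}
// usage:
// c := colly.NewCollector(colly.Debugger(&stdoutDebugger{}))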
Distributed Scraping
main.go
package main
import (
"bytes"
"log"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/proxy"
)
func main() {
c := colly.NewCollector()
if p, err := proxy.RoundRobinProxySwitcher(
"socks5://127.0.0.1:1337",
"socks5://127.0.0.1:1338",
"http://127.0.0.1:8080",
); err == nil {
c.SetProxyFunc(p)
}
c.OnResponse(func(r *colly.Response) {
log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
})
for i := 0; i < 5; i++ {
c.Visit("https://httpbin.org/ip")
}
}
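SetProxyFunc also accepts any custom selector with the signature func(*http.Request) (*url.URL, error), so proxy selection logic can be written by hand; a minimal sketch (the address is a placeholder, and net/http plus net/url must be imported):
c.SetProxyFunc(func(r *http.Request) (*url.URL, error) {
return url.Parse("socks5://127.0.0.1:1337")
})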
Storage
main.go
package main
import (
"log"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"github.com/gocolly/redisstorage"
)
func main() {
urls := []string{
"http://httpbin.org/",
"http://httpbin.org/ip",
"http://httpbin.org/cookies/set?a=b&c=d",
"http://httpbin.org/cookies",
}
c := colly.NewCollector()
// create the redis storage
storage := &redisstorage.Storage{
Address: "127.0.0.1:6379",
Password: "",
DB: 0,
Prefix: "httpbin_test",
}
// add storage to the collector
err := c.SetStorage(storage)
if err != nil {
panic(err)
}
// delete previous data from storage
if err := storage.Clear(); err != nil {
log.Fatal(err)
}
// close redis client
defer storage.Client.Close()
// create a new request queue with redis storage backend
q, _ := queue.New(2, storage)
c.OnResponse(func(r *colly.Response) {
log.Println("Cookies:", c.Cookies(r.Request.URL.String()))
})
// add URLs to the queue
for _, u := range urls {
q.AddURL(u)
}
// consume requests
q.Run(c)
}
In-memory storage is the default storage backend in Colly; use collector.SetStorage() to override it.
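As a sketch of that default, the in-memory backend can also be wired up explicitly, which is the same place a custom implementation would plug in (assuming the github.com/gocolly/colly/v2/storage package path):
import "github.com/gocolly/colly/v2/storage"
c := colly.NewCollector()
// the zero value is enough; SetStorage calls Init() on the backend
if err := c.SetStorage(&storage.InMemoryStorage{}); err != nil {
panic(err)
}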
Multiple Collectors
main.go
// ...
c := colly.NewCollector(
colly.UserAgent("myUserAgent"),
colly.AllowedDomains("foo.com", "bar.com"),
)
// `Clone()` duplicates a collector with identical configuration but without the attached callbacks.
// Custom User-Agent and allowed domains are cloned to c2
c2 := c.Clone()
c.OnResponse(func(r *colly.Response) {
r.Ctx.Put("Custom-Header", r.Headers.Get("Custom-Header"))
// Use collector's `Request()` function to be able to share context with other collectors.
c2.Request("GET", "https://foo.com/", nil, r.Ctx, nil)
})
// ...
Using multiple collectors is advised for complex tasks, for example when one collector parses the list views and handles paging while the other one collects the details.
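The second collector can then read the shared values back from the request context, for example (the key matches the one stored in the snippet above):
c2.OnResponse(func(r *colly.Response) {
fmt.Println("shared value:", r.Ctx.Get("Custom-Header"))
})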
Disable or Limit Connection Keep-Alive
main.go
// ...
c := colly.NewCollector()
c.WithTransport(&http.Transport{
DisableKeepAlives: true,
})
// ...
Colly uses HTTP keep-alive to enhance scraping speed. Keep-alive requires open file descriptors, so the max-fd limit can easily be reached with long-running jobs.
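Keep-alive can also be limited instead of disabled by capping the idle connection pool; a sketch with arbitrary values:
c.WithTransport(&http.Transport{
MaxIdleConns: 20,
MaxIdleConnsPerHost: 2,
IdleConnTimeout: 30 * time.Second,
})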
Parallel
main.go
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
func main() {
// Instantiate default collector
c := colly.NewCollector(
// MaxDepth is 2, so only the links on the scraped page
// and links on those pages are visited
colly.MaxDepth(2),
colly.Async(true),
)
// Limit the maximum parallelism to 2
// This is necessary if the goroutines are dynamically
// created to control the limit of simultaneous requests.
//
// Parallelism can be controlled also by spawning fixed
// number of go routines.
c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})
// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
// Print link
fmt.Println(link)
// Visit link found on page on a new thread
e.Request.Visit(link)
})
// Start scraping on https://en.wikipedia.org
c.Visit("https://en.wikipedia.org/")
// Wait until threads are finished
c.Wait()
}
Queue
main.go
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
)
func main() {
url := "https://httpbin.org/delay/1"
// Instantiate default collector
c := colly.NewCollector()
// create a request queue with 2 consumer threads
q, _ := queue.New(
2, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
)
c.OnRequest(func(r *colly.Request) {
fmt.Println("visiting", r.URL)
})
for i := 0; i < 5; i++ {
// Add URLs to the queue
q.AddURL(fmt.Sprintf("%s?n=%d", url, i))
}
// Consume URLs
q.Run(c)
}
Rate Limit
main.go
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
func main() {
url := "https://httpbin.org/delay/2"
// Instantiate default collector
c := colly.NewCollector(
// Turn on asynchronous requests
colly.Async(true),
// Attach a debugger to the collector
colly.Debugger(&debug.LogDebugger{}),
)
// Limit the number of threads started by colly to two
// when visiting links whose domains match the "*httpbin.*" glob
c.Limit(&colly.LimitRule{
DomainGlob: "*httpbin.*",
Parallelism: 2,
//Delay: 5 * time.Second,
//RandomDelay: 5 * time.Second, // fixed delay or random delay
})
// Start scraping in five threads on https://httpbin.org/delay/2
for i := 0; i < 5; i++ {
c.Visit(fmt.Sprintf("%s?n=%d", url, i))
}
// Wait until threads are finished
c.Wait()
}
Request Context
main.go
// ...
// Before making a request put the URL with
// the key of "url" into the context of the request
c.OnRequest(func(r *colly.Request) {
r.Ctx.Put("url", r.URL.String())
})
// After making a request get "url" from
// the context of the request
c.OnResponse(func(r *colly.Response) {
fmt.Println(r.Ctx.Get("url"))
})
// ...
URL Filter
main.go
// ...
c := colly.NewCollector(
// Visit only root url and urls which start with "e" or "h" on httpbin.org
colly.URLFilters(
regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
regexp.MustCompile("http://httpbin\\.org/h.+"),
),
)
c.Visit("http://httpbin.org/")
// ...
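The opposite approach is also possible: DisallowedURLFilters blocks matching URLs and, as noted in the Collector struct above, is evaluated before URLFilters. A sketch reusing the httpbin pattern:
c := colly.NewCollector(
colly.DisallowedURLFilters(
regexp.MustCompile("http://httpbin\\.org/h.+"),
),
)
c.Visit("http://httpbin.org/")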