Getting started

Installation

mkdir myapp && cd $_
go mod init myapp
go get -u github.com/gocolly/colly/v2

Get Started

main.go

package main

import (
    "fmt"
    "log"

    "github.com/gocolly/colly/v2"
)

func main() {
    // Instantiate default collector
    c := colly.NewCollector()

    // Before making a request print "Visiting ..."
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    // Called if an error occurred during the request
    c.OnError(func(_ *colly.Response, err error) {
        log.Println("Something went wrong:", err)
    })

    // Called after the response headers are received
    c.OnResponseHeaders(func(r *colly.Response) {
        fmt.Println("Visited", r.Request.URL)
    })

    // Called after response received
    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Visited", r.Request.URL)
    })

    // Called right after `OnResponse` if the received content is HTML
    // On every a element which has href attribute call callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnHTML("tr td:nth-of-type(1)", func(e *colly.HTMLElement) {
        fmt.Println("First column of a table row:", e.Text)
    })

    // Called right after `OnHTML` if the received content is HTML or XML
    c.OnXML("//h1", func(e *colly.XMLElement) {
        fmt.Println(e.Text)
    })

    // Called after `OnXML` callbacks
    c.OnScraped(func(r *colly.Response) {
        fmt.Println("Finished", r.Request.URL)
    })

    // Start scraping on https://hackerspaces.org
    c.Visit("https://hackerspaces.org/")
}

type Collector

colly.go

type Collector struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// Custom headers for the request
	Headers *http.Header
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited
	DisallowedURLFilters []*regexp.Regexp

	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request won't be stopped.
	// Leave it blank to allow any URLs to be visited
	URLFilters []*regexp.Regexp

	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file.  See http://www.robotstxt.org/ for more
	// information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
	// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
	DetectCharset bool

	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
	TraceHTTP bool
	// Context is the context that will be used for HTTP requests. You can set this
	// to support clean cancellation of scraping.
	Context context.Context
	// MaxRequests limits the number of requests done by the instance.
	// Set it to 0 for infinite requests (default).
	MaxRequests uint32
	// contains filtered or unexported fields
}

See colly.go
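
The exported fields above can also be set directly on a collector after it has been created. A minimal sketch (the field values are illustrative):

// ...
c := colly.NewCollector()
c.UserAgent = "my-crawler/1.0"
c.MaxDepth = 3
c.AllowedDomains = []string{"example.com"}
c.CacheDir = "./colly_cache"
c.ParseHTTPErrorResponse = true
// ...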

Configuration

Create Collector with Default Settings

main.go

//...
c := colly.NewCollector()
//...

Overwrite Default Settings

main.go

//...
import (
    "math/rand"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/extensions"
)
c := colly.NewCollector(
	colly.UserAgent("xy"),
	colly.AllowURLRevisit(),
)
extensions.RandomUserAgent(c)
extensions.Referer(c)

// or

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

func RandomString() string {
	b := make([]byte, rand.Intn(10)+10)
	for i := range b {
		b[i] = letterBytes[rand.Intn(len(letterBytes))]
	}
	return string(b)
}
c2 := colly.NewCollector()
c2.AllowURLRevisit = true

// changes User-Agent on every request
c2.OnRequest(func(r *colly.Request) {
	r.Headers.Set("User-Agent", RandomString())
})
//...

Configuration can be changed at any point during a scraping job by overwriting the collector's attributes.
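
For example, a callback registered on the collector can tighten the configuration while the job is running; a minimal sketch (the trigger condition is illustrative):

// ...
c := colly.NewCollector()

c.OnResponse(func(r *colly.Response) {
	// after the seed page has been fetched, restrict further crawling
	c.MaxDepth = 2
	c.AllowURLRevisit = false
})
// ...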

Configuration via Environment Variables

.env

# comma separated list of domains
COLLY_ALLOWED_DOMAINS=
# string
COLLY_CACHE_DIR=
# y/n
COLLY_DETECT_CHARSET=
# y/n
COLLY_DISABLE_COOKIES=
# comma separated list of domains
COLLY_DISALLOWED_DOMAINS=
# y/n
COLLY_IGNORE_ROBOTSTXT=
# y/n
COLLY_FOLLOW_REDIRECTS=
# int
COLLY_MAX_BODY_SIZE=
# int - 0 means infinite
COLLY_MAX_DEPTH=
# y/n
COLLY_PARSE_HTTP_ERROR_RESPONSE=
# string
COLLY_USER_AGENT=
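
These variables are picked up when colly.NewCollector() is called, so they can also be set programmatically before the collector is created; a minimal sketch (the values are illustrative):

// ...
os.Setenv("COLLY_USER_AGENT", "my-crawler/1.0")
os.Setenv("COLLY_MAX_DEPTH", "2")

// environment settings apply to collectors created after this point
c := colly.NewCollector()
// ...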

HTTP Configuration

main.go

// ...
c := colly.NewCollector()
c.WithTransport(&http.Transport{
	Proxy: http.ProxyFromEnvironment,
	DialContext: (&net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
		DualStack: true,
	}).DialContext,
	MaxIdleConns:          100,
	IdleConnTimeout:       90 * time.Second,
	TLSHandshakeTimeout:   10 * time.Second,
	ExpectContinueTimeout: 1 * time.Second,
})
// ...

For more configuration options, see the documentation of Go's http.Client and http.RoundTripper.
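
A per-request timeout can also be set on the collector itself; the sketch below assumes Collector.SetRequestTimeout is available in your colly version:

// ...
c := colly.NewCollector()
// abort requests that take longer than 30 seconds in total
c.SetRequestTimeout(30 * time.Second)
// ...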

Best Practices

Debugging

main.go

import (
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
)

func main() {
    c := colly.NewCollector(colly.Debugger(&debug.LogDebugger{}))
    // [..]
}

You can create any kind of custom debugger by implementing the debug.Debugger interface. A good example is LogDebugger.
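
A minimal custom debugger could look like the sketch below; it assumes debug.Debugger consists of Init() and Event() and that debug.Event exposes CollectorID, Type and Values (check the debug package for the authoritative definitions):

// ...
// StderrDebugger is a minimal custom debugger that writes every event to stderr.
type StderrDebugger struct{}

// Init initializes the debugger backend; nothing to set up here.
func (d *StderrDebugger) Init() error { return nil }

// Event receives collector events and prints them.
func (d *StderrDebugger) Event(e *debug.Event) {
	fmt.Fprintf(os.Stderr, "[collector %d] %s %v\n", e.CollectorID, e.Type, e.Values)
}

// usage: c := colly.NewCollector(colly.Debugger(&StderrDebugger{}))
// ...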

Distributed Scraping

main.go

package main

import (
    "bytes"
	"log"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/proxy"
)

func main() {
	c := colly.NewCollector()

	if p, err := proxy.RoundRobinProxySwitcher(
		"socks5://127.0.0.1:1337",
		"socks5://127.0.0.1:1338",
		"http://127.0.0.1:8080",
	); err == nil {
		c.SetProxyFunc(p)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
	})

	for i := 0; i < 5; i++ {
		c.Visit("https://httpbin.org/ip")
	}
}

Storage

main.go

package main

import (
	"log"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
	"github.com/gocolly/redisstorage"
)

func main() {
	urls := []string{
		"http://httpbin.org/",
		"http://httpbin.org/ip",
		"http://httpbin.org/cookies/set?a=b&c=d",
		"http://httpbin.org/cookies",
	}

	c := colly.NewCollector()

	// create the redis storage
	storage := &redisstorage.Storage{
		Address:  "127.0.0.1:6379",
		Password: "",
		DB:       0,
		Prefix:   "httpbin_test",
	}

	// add storage to the collector
	err := c.SetStorage(storage)
	if err != nil {
		panic(err)
	}

	// delete previous data from storage
	if err := storage.Clear(); err != nil {
		log.Fatal(err)
	}

	// close redis client
	defer storage.Client.Close()

	// create a new request queue with redis storage backend
	q, _ := queue.New(2, storage)

	c.OnResponse(func(r *colly.Response) {
		log.Println("Cookies:", c.Cookies(r.Request.URL.String()))
	})

	// add URLs to the queue
	for _, u := range urls {
		q.AddURL(u)
	}
	// consume requests
	q.Run(c)
}

In-memory storage is the default backend in Colly; use collector.SetStorage() to override it with another backend.
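
A custom backend passed to collector.SetStorage() needs to implement the storage interface, which is roughly the following (see the storage package for the authoritative definition):

// from github.com/gocolly/colly/v2/storage (paraphrased)
type Storage interface {
	// Init initializes the storage backend
	Init() error
	// Visited marks a request ID as visited
	Visited(requestID uint64) error
	// IsVisited reports whether a request ID has been visited before
	IsVisited(requestID uint64) (bool, error)
	// Cookies returns the stored cookies for a URL
	Cookies(u *url.URL) string
	// SetCookies stores cookies for a URL
	SetCookies(u *url.URL, cookies string)
}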

Multiple Collectors

main.go

// ...
c := colly.NewCollector(
	colly.UserAgent("myUserAgent"),
	colly.AllowedDomains("foo.com", "bar.com"),
)

// `Clone()` duplicates a collector with identical configuration but without the attached callbacks.
// Custom User-Agent and allowed domains are cloned to c2
c2 := c.Clone()

c.OnResponse(func(r *colly.Response) {
	r.Ctx.Put(r.Headers.Get("Custom-Header"))
	// Use the collector's `Request()` function to be able to share context with other collectors.
	c2.Request("GET", "https://foo.com/", nil, r.Ctx, nil)
})
// ...

Using multiple collectors is advised for complex tasks, for example when one collector parses the list views and handles paging while another one collects the details.
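
A minimal sketch of that list/detail pattern (the selectors and URL are hypothetical):

// ...
listCollector := colly.NewCollector()
detailCollector := listCollector.Clone()

// ".item a" is a hypothetical selector for entries on a listing page
listCollector.OnHTML(".item a[href]", func(e *colly.HTMLElement) {
	// hand detail pages over to the second collector
	detailCollector.Visit(e.Request.AbsoluteURL(e.Attr("href")))
})

// ".next a" is a hypothetical selector for the pagination link
listCollector.OnHTML(".next a[href]", func(e *colly.HTMLElement) {
	e.Request.Visit(e.Attr("href"))
})

detailCollector.OnHTML("h1", func(e *colly.HTMLElement) {
	fmt.Println("Detail page title:", e.Text)
})

listCollector.Visit("https://example.com/list") // hypothetical URL
// ...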

Disable or Limit Connection Keep-Alive

main.go

// ...
c := colly.NewCollector()
c.WithTransport(&http.Transport{
    DisableKeepAlives: true,
})
// ...

Colly uses HTTP keep-alive to enhance scraping speed. Keep-alive requires open file descriptors, so the max-fd limit can easily be reached with long-running jobs.
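
If disabling keep-alive entirely is too drastic, the transport can instead cap how many idle connections stay open; a minimal sketch using standard net/http transport options:

// ...
c := colly.NewCollector()
c.WithTransport(&http.Transport{
	// keep connection reuse, but bound the number of open file descriptors
	MaxIdleConns:        10,
	MaxIdleConnsPerHost: 2,
	IdleConnTimeout:     30 * time.Second,
})
// ...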

Parallel

main.go

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// MaxDepth is 2, so only the links on the scraped page
		// and links on those pages are visited
		colly.MaxDepth(2),
		colly.Async(true),
	)

	// Limit the maximum parallelism to 2
	// This is necessary if the goroutines are dynamically
	// created to control the limit of simultaneous requests.
	//
	// Parallelism can be controlled also by spawning fixed
	// number of go routines.
	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

	// On every a element which has href attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print link
		fmt.Println(link)
		// Visit link found on page on a new thread
		e.Request.Visit(link)
	})

	// Start scraping on https://en.wikipedia.org
	c.Visit("https://en.wikipedia.org/")
	// Wait until threads are finished
	c.Wait()
}

Queue

main.go

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

func main() {
	url := "https://httpbin.org/delay/1"

	// Instantiate default collector
	c := colly.NewCollector()

	// create a request queue with 2 consumer threads
	q, _ := queue.New(
		2, // Number of consumer threads
		&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
	)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("visiting", r.URL)
	})

	for i := 0; i < 5; i++ {
		// Add URLs to the queue
		q.AddURL(fmt.Sprintf("%s?n=%d", url, i))
	}
	// Consume URLs
	q.Run(c)

}

Rate Limit

main.go

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
)

func main() {
	url := "https://httpbin.org/delay/2"

	// Instantiate default collector
	c := colly.NewCollector(
		// Turn on asynchronous requests
		colly.Async(true),
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Limit the number of threads started by colly to two
	// when visiting links whose domain matches the "*httpbin.*" glob
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		// Delay:       5 * time.Second, // fixed delay
		// RandomDelay: 5 * time.Second, // random delay
	})

	// Start scraping in five threads on https://httpbin.org/delay/2
	for i := 0; i < 5; i++ {
		c.Visit(fmt.Sprintf("%s?n=%d", url, i))
	}
	// Wait until threads are finished
	c.Wait()
}

Request Context

main.go

// ...
// Before making a request put the URL with
// the key of "url" into the context of the request
c.OnRequest(func(r *colly.Request) {
    r.Ctx.Put("url", r.URL.String())
})

// After making a request get "url" from
// the context of the request
c.OnResponse(func(r *colly.Response) {
    fmt.Println(r.Ctx.Get("url"))
})
// ...

URL Filter

main.go

// ...
c := colly.NewCollector(
    // Visit only root url and urls which start with "e" or "h" on httpbin.org
    colly.URLFilters(
        regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
        regexp.MustCompile("http://httpbin\\.org/h.+"),
    ),
)

c.Visit("http://httpbin.org/")
// ...
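
DisallowedURLFilters (see the Collector struct above) work the same way but block matching URLs instead; a minimal sketch with illustrative patterns:

// ...
c := colly.NewCollector(
    // skip logout and admin pages (illustrative patterns)
    colly.DisallowedURLFilters(
        regexp.MustCompile(`/logout`),
        regexp.MustCompile(`/admin/`),
    ),
)

c.Visit("http://httpbin.org/")
// ...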
