Documentation
¶
Index ¶
- func ReplayGetRequests(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (key string, cache bool)
- func WithAutoThrottleDelayBounds(minDelay, maxDelay time.Duration) autoThrottleOption
- func WithAutoThrottleStartDelay(delay time.Duration) autoThrottleOption
- func WithAutoThrottleTargetConcurrency(concurrency int) autoThrottleOption
- type AllowedDomains
- type AutoThrottle
- type Cookies
- type Dedupe
- type FSReplayStore
- type GobMetaEncoder
- type Headers
- type MemoryReplayStore
- type MetaEncoder
- type Replay
- type ReplayHandler
- type ReplayStore
- type Throttle
- type ThrottleHandler
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ReplayGetRequests ¶
func ReplayGetRequests(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (key string, cache bool)
ReplayGetRequests is a ReplayHandler that replays all GET requests and uses their normalized URL as the request key.
func WithAutoThrottleDelayBounds ¶
WithAutoThrottleDelayBounds defines minimum and maximum delay before making a request.
func WithAutoThrottleStartDelay ¶
WithAutoThrottleStartDelay defines the starting delay for any given request.
func WithAutoThrottleTargetConcurrency ¶
func WithAutoThrottleTargetConcurrency(concurrency int) autoThrottleOption
WithAutoThrottleTargetConcurrency defines the target number of requests that should hit the server at the same time.
Types ¶
type AllowedDomains ¶
type AllowedDomains struct {
// contains filtered or unexported fields
}
AllowedDomains is a downloader.DownloaderMiddleware that limits the domains of requests and responses.
func NewAllowedDomains ¶
func NewAllowedDomains(forRequests, forResponses []string) AllowedDomains
NewAllowedDomains creates an AllowedDomains middleware.
- if forRequests is nil or empty, it will allow all requests.
- if forResponses is nil or empty, it will allow all responses.
You can use wildcards (*) in the domains; see the gobwas/glob [documentation](https://github.com/gobwas/glob) for the supported pattern syntax.
func (AllowedDomains) HandleRequest ¶
func (p AllowedDomains) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (AllowedDomains) HandleResponse ¶
func (p AllowedDomains) HandleResponse( ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata, ) error
type AutoThrottle ¶
type AutoThrottle struct {
// contains filtered or unexported fields
}
AutoThrottle automatically limits scraping speed in order to lessen the burden on websites, avoid rate-limiting, and decrease overall scraping time.
- Based on scrapy's AutoThrottle [algorithm](https://docs.scrapy.org/en/latest/topics/autothrottle.html#throttling-algorithm).
func NewAutoThrottle ¶
func NewAutoThrottle(options ...autoThrottleOption) *AutoThrottle
func (*AutoThrottle) HandleResponse ¶
func (a *AutoThrottle) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata)
func (*AutoThrottle) Throttle ¶
func (a *AutoThrottle) Throttle(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) time.Duration
type Cookies ¶
type Cookies struct {
// contains filtered or unexported fields
}
Cookies persists cookies across requests.
func NewCookies ¶
func (Cookies) HandleRequest ¶
func (c Cookies) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (Cookies) HandleResponse ¶
func (c Cookies) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error
type Dedupe ¶
type Dedupe struct {
// contains filtered or unexported fields
}
Dedupe drops duplicate GET requests; requests are differentiated by their normalized URL.
func (*Dedupe) HandleRequest ¶
func (d *Dedupe) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (*Dedupe) HandleResponse ¶
func (d *Dedupe) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error
type FSReplayStore ¶
type FSReplayStore struct {
// contains filtered or unexported fields
}
FSReplayStore implements ReplayStore with the local filesystem.
func NewFSReplayStore ¶
func NewFSReplayStore(dir string, menc MetaEncoder) FSReplayStore
func (FSReplayStore) Get ¶
func (s FSReplayStore) Get(ctx context.Context, session, id string) *downloader.Response
func (FSReplayStore) Set ¶
func (s FSReplayStore) Set(ctx context.Context, session, id string, res *downloader.Response)
type GobMetaEncoder ¶
type GobMetaEncoder struct{}
GobMetaEncoder implements MetaEncoder using encoding/gob.
func NewGobMetaEncoder ¶
func NewGobMetaEncoder(types ...any) GobMetaEncoder
NewGobMetaEncoder creates a MetaEncoder that uses encoding/gob. All the types you expect to be in downloader.Request.Meta should be passed as parameters to this function so that they are registered with encoding/gob.
type Headers ¶
type Headers struct {
// contains filtered or unexported fields
}
Headers overrides the given headers with the given header values in each request.
func NewHeaders ¶
func (*Headers) HandleRequest ¶
func (h *Headers) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (*Headers) HandleResponse ¶
func (h *Headers) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error
type MemoryReplayStore ¶
type MemoryReplayStore struct {
// contains filtered or unexported fields
}
MemoryReplayStore implements ReplayStore with an in-memory sync.Map.
func NewMemoryReplayStore ¶
func NewMemoryReplayStore() *MemoryReplayStore
func (*MemoryReplayStore) Get ¶
func (s *MemoryReplayStore) Get(ctx context.Context, session, id string) *downloader.Response
func (*MemoryReplayStore) Has ¶
func (s *MemoryReplayStore) Has(ctx context.Context, session, id string) bool
func (*MemoryReplayStore) Set ¶
func (s *MemoryReplayStore) Set(ctx context.Context, session, id string, res *downloader.Response)
type MetaEncoder ¶
type MetaEncoder interface { // Marshal turns a given metadata element into a byteslice. // // Marshal can return a zero-length byteslice to indicate that the given metadata element should not be serialized. Marshal(meta any) ([]byte, error) // Unmarshal returns a given metadata element from a byteslice. Unmarshal(buff []byte) (any, error) }
MetaEncoder takes in an any value used as an element in request metadata and serializes it.
type Replay ¶
type Replay struct {
// contains filtered or unexported fields
}
Replay provides response replay (and caching) functionality.
func NewReplay ¶
func NewReplay(sessionId string, store ReplayStore, handler ReplayHandler) Replay
func (Replay) HandleRequest ¶
func (c Replay) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (Replay) HandleResponse ¶
func (c Replay) HandleResponse( ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata, ) error
type ReplayHandler ¶
type ReplayHandler = func(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (key string, replay bool)
ReplayHandler determines what requests to replay and what the unique key for the request should be.
type ReplayStore ¶
type ReplayStore interface { Has(ctx context.Context, session, id string) bool // Get should return nil if a stored response with the given id does not yet exist. Get(ctx context.Context, session, id string) *downloader.Response Set(ctx context.Context, session, id string, res *downloader.Response) }
ReplayStore is an abstract interface that various storage mechanisms can implement in order to be used by a Replay.
type Throttle ¶
type Throttle struct {
// contains filtered or unexported fields
}
Throttle throttles crawling speed to ease load on website servers.
Note: This is partly based on scrapy's [AutoThrottle](https://docs.scrapy.org/en/latest/topics/autothrottle.html) extension.
func NewThrottle ¶
func NewThrottle(handler ThrottleHandler) Throttle
func (Throttle) HandleRequest ¶
func (t Throttle) HandleRequest(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (*downloader.Response, error)
func (Throttle) HandleResponse ¶
func (t Throttle) HandleResponse( ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata, ) error
type ThrottleHandler ¶
type ThrottleHandler interface { // Throttle returns the amount of time to wait before making the request. Throttle(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (delay time.Duration) // HandleResponse will be called with all the responses returned by the HTTP client. HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) }
ThrottleHandler