middleware

package
v0.0.0-...-5ebff26 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 11, 2025 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ReplayGetRequests

func ReplayGetRequests(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (key string, cache bool)

ReplayGetRequests is a ReplayHandler that replays all GET requests and uses their normalized URL as the request key.

func WithAutoThrottleDelayBounds

func WithAutoThrottleDelayBounds(minDelay, maxDelay time.Duration) autoThrottleOption

WithAutoThrottleDelayBounds defines the minimum and maximum delay before making a request.

func WithAutoThrottleStartDelay

func WithAutoThrottleStartDelay(delay time.Duration) autoThrottleOption

WithAutoThrottleStartDelay defines the starting delay for any given request.

func WithAutoThrottleTargetConcurrency

func WithAutoThrottleTargetConcurrency(concurrency int) autoThrottleOption

WithAutoThrottleTargetConcurrency defines the target number of requests that should hit the server at the same time.

Types

type AllowedDomains

type AllowedDomains struct {
	// contains filtered or unexported fields
}

AllowedDomains is a downloader.DownloaderMiddleware that limits the domains of requests and responses.

func NewAllowedDomains

func NewAllowedDomains(forRequests, forResponses []string) AllowedDomains

NewAllowedDomains creates an AllowedDomains middleware.

  • if forRequests is nil or empty, it will allow all requests.
  • if forResponses is nil or empty, it will allow all responses.

You can use wildcards (*) in the domains; see the glob pattern documentation at https://github.com/gobwas/glob for the supported syntax.

func (AllowedDomains) HandleRequest

func (AllowedDomains) HandleResponse

func (p AllowedDomains) HandleResponse(
	ctx context.Context,
	res *downloader.Response,
	meta downloader.ResponseMetadata,
) error

type AutoThrottle

type AutoThrottle struct {
	// contains filtered or unexported fields
}

AutoThrottle automatically limits scraping speed in order to lessen the burden on websites, avoid rate-limiting, and decrease overall scraping time.

func NewAutoThrottle

func NewAutoThrottle(options ...autoThrottleOption) *AutoThrottle

func (*AutoThrottle) HandleResponse

func (a *AutoThrottle) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata)

func (*AutoThrottle) Throttle

type Cookies

type Cookies struct {
	// contains filtered or unexported fields
}

Cookies persists cookies across requests.

func NewCookies

func NewCookies(jar *cookiejar.Jar) Cookies

func (Cookies) HandleRequest

func (Cookies) HandleResponse

func (c Cookies) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error

type Dedupe

type Dedupe struct {
	// contains filtered or unexported fields
}

Dedupe drops duplicate GET requests; requests are differentiated by their normalized URL.

func NewDedupe

func NewDedupe() *Dedupe

func (*Dedupe) HandleRequest

func (*Dedupe) HandleResponse

func (d *Dedupe) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error

type FSReplayStore

type FSReplayStore struct {
	// contains filtered or unexported fields
}

FSReplayStore implements ReplayStore with the local filesystem.

func NewFSReplayStore

func NewFSReplayStore(dir string, menc MetaEncoder) FSReplayStore

func (FSReplayStore) Get

func (s FSReplayStore) Get(ctx context.Context, session, id string) *downloader.Response

func (FSReplayStore) Has

func (s FSReplayStore) Has(ctx context.Context, session, id string) bool

func (FSReplayStore) Set

func (s FSReplayStore) Set(ctx context.Context, session, id string, res *downloader.Response)

type GobMetaEncoder

type GobMetaEncoder struct{}

GobMetaEncoder implements MetaEncoder using encoding/gob.

func NewGobMetaEncoder

func NewGobMetaEncoder(types ...any) GobMetaEncoder

NewGobMetaEncoder creates a MetaEncoder that uses encoding/gob. All the types you expect to be in downloader.Request.Meta should be passed as parameters to this function so that they are registered with encoding/gob.

func (GobMetaEncoder) Marshal

func (GobMetaEncoder) Marshal(meta any) ([]byte, error)

func (GobMetaEncoder) Unmarshal

func (GobMetaEncoder) Unmarshal(buff []byte) (any, error)

type Headers

type Headers struct {
	// contains filtered or unexported fields
}

Headers overrides the given headers with the given header values in each request.

func NewHeaders

func NewHeaders(headers http.Header) *Headers

func (*Headers) HandleRequest

func (*Headers) HandleResponse

func (h *Headers) HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata) error

type MemoryReplayStore

type MemoryReplayStore struct {
	// contains filtered or unexported fields
}

MemoryReplayStore implements ReplayStore with an in-memory sync.Map.

func NewMemoryReplayStore

func NewMemoryReplayStore() *MemoryReplayStore

func (*MemoryReplayStore) Get

func (s *MemoryReplayStore) Get(ctx context.Context, session, id string) *downloader.Response

func (*MemoryReplayStore) Has

func (s *MemoryReplayStore) Has(ctx context.Context, session, id string) bool

func (*MemoryReplayStore) Set

func (s *MemoryReplayStore) Set(ctx context.Context, session, id string, res *downloader.Response)

type MetaEncoder

type MetaEncoder interface {
	// Marshal turns a given metadata element into a byteslice.
	//
	// Marshal can return a zero-length byteslice to indicate that the given metadata element should not be serialized.
	Marshal(meta any) ([]byte, error)

	// Unmarshal returns a given metadata element from a byteslice.
	Unmarshal(buff []byte) (any, error)
}

MetaEncoder takes in an any value used as an element in request metadata and serializes it.

type Replay

type Replay struct {
	// contains filtered or unexported fields
}

Replay provides response replay (and caching) functionality.

func NewReplay

func NewReplay(sessionId string, store ReplayStore, handler ReplayHandler) Replay

func (Replay) HandleRequest

func (Replay) HandleResponse

func (c Replay) HandleResponse(
	ctx context.Context,
	res *downloader.Response,
	meta downloader.ResponseMetadata,
) error

type ReplayHandler

type ReplayHandler = func(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (key string, replay bool)

ReplayHandler determines what requests to replay and what the unique key for the request should be.

type ReplayStore

type ReplayStore interface {
	Has(ctx context.Context, session, id string) bool
	// Get should return nil if a stored response with the given id does not yet exist.
	Get(ctx context.Context, session, id string) *downloader.Response
	Set(ctx context.Context, session, id string, res *downloader.Response)
}

ReplayStore is an abstract interface that various cache-storing mechanisms can implement in order to be used by a Replay.

type Throttle

type Throttle struct {
	// contains filtered or unexported fields
}

Throttle throttles crawling speed to ease load on website servers.

Note: This is partly based on Scrapy's AutoThrottle (https://docs.scrapy.org/en/latest/topics/autothrottle.html) extension.

func NewThrottle

func NewThrottle(handler ThrottleHandler) Throttle

func (Throttle) HandleRequest

func (Throttle) HandleResponse

func (t Throttle) HandleResponse(
	ctx context.Context,
	res *downloader.Response,
	meta downloader.ResponseMetadata,
) error

type ThrottleHandler

type ThrottleHandler interface {
	// Throttle returns the amount of time to wait before making the request.
	Throttle(ctx context.Context, req *downloader.Request, meta downloader.RequestMetadata) (delay time.Duration)
	// HandleResponse will be called with all the responses returned by the HTTP client.
	HandleResponse(ctx context.Context, res *downloader.Response, meta downloader.ResponseMetadata)
}

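ThrottleHandler decides how long to wait before each request is made and is called with every response returned by the HTTP client.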

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL