Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
Types ¶
type Config ¶
// Config groups the crawler settings: database access, start URLs, filters,
// size and concurrency limits, crawl delays, logging and the HTTP transport.
type Config struct {
// DBopener opens the crawl database. It receives a logger, the base path
// of the database (see DBbase) and a flag that enables statistics logging,
// and returns a list of URLs, the database handle and an error.
// NOTE(review): the original comment reads "The database, DB or the root
// if DB is nil"; the exact meaning of the returned URLs is not visible
// here and should be confirmed against the implementation.
// Commented-out earlier field, kept for reference:
// DB crawldatabase.Database[Page]
DBopener func(logger *slog.Logger, base string, logStatistics bool) ([]*url.URL, *crawldatabase.Database[Page], error)
// DBbase is the base path of the database.
// It is passed as the base argument of DBopener.
DBbase string
// Input holds the root URLs from which reading begins.
Input []*url.URL
// FilterURL and FilterPage filter by URL or by page content (for example
// by language). A filter returns true to strike (exclude) the page.
// The files "/robots.txt" and "/favicon.ico" are not tested.
FilterURL []func(*url.URL) bool
FilterPage []func(*htmlnode.Root) bool
// MaxLength is the maximum size of an HTML page (presumably in bytes;
// confirm against the fetch code).
// Google uses 15M: https://developers.google.com/search/docs/crawling-indexing/googlebot#how-googlebot-accesses-your-site
MaxLength int64
// MaxGo is the maximum number of crawl goroutines.
MaxGo int
// MinCrawlDelay and MaxCrawlDelay are the lower and upper bounds of the
// crawl delay. The value actually used is determined by the robots.txt.
// Required: MinCrawlDelay < MaxCrawlDelay.
MinCrawlDelay, MaxCrawlDelay time.Duration
// Logger is a simple slog logger used to log the database activity.
Logger *slog.Logger
// RoundTripper is used to fetch all HTTP resources.
RoundTripper http.RoundTripper
}
type Page ¶
type ProcessFunc ¶
// ProcessFunc is a function type that receives a crawled *Page.
// It has a Process method, so a plain function can be used where a value
// with a Process(*Page) method is expected (presumably by the package's
// Process function; confirm against the caller).
type ProcessFunc func(page *Page)
ProcessFunc is a function type that can be used by the Process function.
func (ProcessFunc) Process ¶
func (process ProcessFunc) Process(page *Page)
Click to show internal directories.
Click to hide internal directories.