diff --git a/.gitignore b/.gitignore index 81457c62..18c2ba82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ -build +.idea +storage +apify_storage dist +build build-docs node_modules *.log diff --git a/package-lock.json b/package-lock.json index 4caa654f..1a524bdf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -54,6 +54,10 @@ "license": "MIT", "peer": true }, + "node_modules/@apify/actor-sitemap-scraper": { + "resolved": "packages/actor-scraper/sitemap-scraper", + "link": true + }, "node_modules/@apify/consts": { "version": "2.47.0", "license": "Apache-2.0" @@ -10425,28 +10429,28 @@ } }, "node_modules/impit": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit/-/impit-0.7.4.tgz", - "integrity": "sha512-KkEf01hsNZqKFbJR+vSHPVaMbdb1je5fBiCB6zGwHmlINUx/oTOsVt+HvU6dM380Zc7lAo4r++eqviLrbzSPOw==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit/-/impit-0.7.5.tgz", + "integrity": "sha512-hvdVIxs8+tXlEvwfBjlceqtm3gGGNnvXtB0ol3I9B6bbz4PcZBeo4CqW4gVm9Lazyzpaqk3RRDMsOzWA2OeB8A==", "license": "Apache-2.0", "engines": { "node": ">= 20" }, "optionalDependencies": { - "impit-darwin-arm64": "0.7.4", - "impit-darwin-x64": "0.7.4", - "impit-linux-arm64-gnu": "0.7.4", - "impit-linux-arm64-musl": "0.7.4", - "impit-linux-x64-gnu": "0.7.4", - "impit-linux-x64-musl": "0.7.4", - "impit-win32-arm64-msvc": "0.7.4", - "impit-win32-x64-msvc": "0.7.4" + "impit-darwin-arm64": "0.7.5", + "impit-darwin-x64": "0.7.5", + "impit-linux-arm64-gnu": "0.7.5", + "impit-linux-arm64-musl": "0.7.5", + "impit-linux-x64-gnu": "0.7.5", + "impit-linux-x64-musl": "0.7.5", + "impit-win32-arm64-msvc": "0.7.5", + "impit-win32-x64-msvc": "0.7.5" } }, "node_modules/impit-darwin-arm64": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-darwin-arm64/-/impit-darwin-arm64-0.7.4.tgz", - "integrity": "sha512-lwhMmGQ/rzBa8lDIjqshlVkFtjCytxCvy4nRuXXvv6BQyN8A7OF1GM9fvLEguCLKi6H+UsM1n8phXzzZFwTA1w==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-darwin-arm64/-/impit-darwin-arm64-0.7.5.tgz", + "integrity": "sha512-H2HFQ+I8ZIApiSjuyS3WBtTMWHuLUCmMeGGINR8Gp+QDYN4GOpsuwUtKkOt02WsxdkBlCO8FErHXSFkLldusMg==", "cpu": [ "arm64" ], @@ -10460,9 +10464,9 @@ } }, "node_modules/impit-darwin-x64": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-darwin-x64/-/impit-darwin-x64-0.7.4.tgz", - "integrity": "sha512-WHCaUFYaWnNzrI0tPRYVaeTdtGRTUpMPwkKmvNK087THHTMSJcx6P8b7H9YWLsihh4EBuuHr5p8s5kMp66+9TA==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-darwin-x64/-/impit-darwin-x64-0.7.5.tgz", + "integrity": "sha512-2KvQHwpo2XpiNfLjTblU3hq7AMZkw3aYXcPCYVbWzx7EewQqFwH8jqsx1oP+rtGGJf6QDAN4KtUhPEC6ouJu5g==", "cpu": [ "x64" ], @@ -10476,9 +10480,9 @@ } }, "node_modules/impit-linux-arm64-gnu": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-linux-arm64-gnu/-/impit-linux-arm64-gnu-0.7.4.tgz", - "integrity": "sha512-jzrjOFDiARlZGvjkyw7fU8pafdIEd3WcNTBnuJfb1Z5kJNMf+jg6fCAnndbv84Zy7GJ6/UUJJiKgASeyv84JRw==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-linux-arm64-gnu/-/impit-linux-arm64-gnu-0.7.5.tgz", + "integrity": "sha512-yyr9lxhXWJASyuoCQB+Tg9UFXNLamjx0AT6UPGHGOSfIt/Dc5sB06w1u7xK5igg4N5l3WyR7wNMNtS977UDhqw==", "cpu": [ "arm64" ], @@ -10492,9 +10496,9 @@ } }, "node_modules/impit-linux-arm64-musl": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-linux-arm64-musl/-/impit-linux-arm64-musl-0.7.4.tgz", - "integrity": 
"sha512-8gvMBzsQENuiHupojg1KopURKlNOP/wU2RyHKEKFiVWITN3DOoCx7axCpX7TlKVo1UaY6cFF6P6rkCF8Agk5Pw==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-linux-arm64-musl/-/impit-linux-arm64-musl-0.7.5.tgz", + "integrity": "sha512-N7K0EA7NFNlLOhHlZg21IEHMV4POMuwmPoEjW3drxrl6DuJcdDJzGg1fFygYwRmmIWu5OuhPbIyOoB+8pEfFAw==", "cpu": [ "arm64" ], @@ -10508,9 +10512,9 @@ } }, "node_modules/impit-linux-x64-gnu": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-linux-x64-gnu/-/impit-linux-x64-gnu-0.7.4.tgz", - "integrity": "sha512-P1HAXpNt5RilAGtY5iB9Qm+GPfhESdp8bot2h8aWrhvuuxZ2F7aiZEJTg10ithyw4Z5nV52GeC6Jzw4FbP9Rew==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-linux-x64-gnu/-/impit-linux-x64-gnu-0.7.5.tgz", + "integrity": "sha512-Afqc6KlYrdkV9UvhdBRLlC/PXbam1i5VOZwe60jzTMACVHTt/MvYTLSx+oAIqb6pZqPQ6LRGbLGMuLMDddCXbQ==", "cpu": [ "x64" ], @@ -10524,9 +10528,9 @@ } }, "node_modules/impit-linux-x64-musl": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-linux-x64-musl/-/impit-linux-x64-musl-0.7.4.tgz", - "integrity": "sha512-NnaRuKubNyLDNdre+2rDwFL96AesWhS5sKuLv5JVOsAGjDiDeNK8t1FsAm6n6OoPQMeXtFPNGns2cxuI5oabuQ==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-linux-x64-musl/-/impit-linux-x64-musl-0.7.5.tgz", + "integrity": "sha512-bf61Gg5FWRKeQXjzKtnLnlgXPQC0MoX8Pwu4b8aNMMzPXj71kRiWWtMGIdyeMGxDR/IcRQXDp+M5yz0WyysZfw==", "cpu": [ "x64" ], @@ -10540,9 +10544,9 @@ } }, "node_modules/impit-win32-arm64-msvc": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-win32-arm64-msvc/-/impit-win32-arm64-msvc-0.7.4.tgz", - "integrity": "sha512-VAABh2+qhLyD1Sn8TmfypxFN7ZSBBZsHf3aDEIhVQDL3dxK/lAEUVxwX/5nQd+M8LWr3QBoidvtzQKyo9uR2rw==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-win32-arm64-msvc/-/impit-win32-arm64-msvc-0.7.5.tgz", + "integrity": "sha512-6QWk/AxzNepRv3T4ZV0le7kYgOzSRR8MgaPYqdXr+NWz/Sta43+NJNn3IBXKoW+fzYs7rcaHd6X6cpmf20W+tw==", "cpu": [ "arm64" ], @@ -10556,9 +10560,9 @@ } }, "node_modules/impit-win32-x64-msvc": { - "version": "0.7.4", - "resolved": "https://registry.npmjs.org/impit-win32-x64-msvc/-/impit-win32-x64-msvc-0.7.4.tgz", - "integrity": "sha512-BjBw0JZ07cLTxi1HLCAhCTmNqRCb+u4H1Vy0HNj+r6TiLrPQM/OyzwBVczo8uqaJfwzwI928wsJhoVPxbbJpcw==", + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/impit-win32-x64-msvc/-/impit-win32-x64-msvc-0.7.5.tgz", + "integrity": "sha512-CAbkxZHmthPcq7M9ag7pq1PimTO8CEAjDmdvvAGt8lLjMtUet379g2hgPGXIFOBckwaPGVVME5mqWfDoJ1NrWw==", "cpu": [ "x64" ], @@ -18461,6 +18465,24 @@ "typescript": "~5.9.0" } }, + "packages/actor-scraper/sitemap-scraper": { + "name": "@apify/actor-sitemap-scraper", + "version": "0.0.1", + "license": "Apache-2.0", + "dependencies": { + "@apify/scraper-tools": "^1.1.4", + "@crawlee/http": "^3.14.1", + "@crawlee/utils": "^3.15.3", + "apify": "^3.2.6", + "impit": "^0.7.5" + }, + "devDependencies": { + "@apify/tsconfig": "^0.1.0", + "@types/node": "^24.0.0", + "tsx": "^4.19.1", + "typescript": "~5.9.0" + } + }, "packages/actor-scraper/web-scraper": { "name": "actor-web-scraper", "version": "3.1.0", diff --git a/packages/actor-scraper/sitemap-scraper/.actor/actor.json b/packages/actor-scraper/sitemap-scraper/.actor/actor.json new file mode 100644 index 00000000..64050569 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/.actor/actor.json @@ -0,0 +1,13 @@ +{ + "actorSpecification": 1, + "name": "sitemap-scraper", + "version": "0.1", + "buildTag": "latest", + "storages": { + 
"dataset": { + "actorSpecification": 1, + "fields": {}, + "views": {} + } + } +} diff --git a/packages/actor-scraper/sitemap-scraper/.dockerignore b/packages/actor-scraper/sitemap-scraper/.dockerignore new file mode 100644 index 00000000..ceb85b1c --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/.dockerignore @@ -0,0 +1,10 @@ +# configurations +.idea + +# crawlee and apify storage folders +apify_storage +crawlee_storage +storage + +# installed files +node_modules diff --git a/packages/actor-scraper/sitemap-scraper/Dockerfile b/packages/actor-scraper/sitemap-scraper/Dockerfile new file mode 100644 index 00000000..f68aa615 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/Dockerfile @@ -0,0 +1,32 @@ +FROM apify/actor-node:22 AS builder + +COPY package*.json ./ + +RUN npm install --include=dev --audit=false + +COPY . ./ + +RUN npm run build + +FROM apify/actor-node:22 + +COPY --from=builder /usr/src/app/dist ./dist + +COPY package*.json ./ + +RUN rm -rf node_modules \ + && npm --quiet set progress=false \ + && npm install --omit=dev \ + && echo "Installed NPM packages:" \ + && (npm list --omit=dev --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version \ + && rm -r ~/.npm + +COPY . ./ + +ENV APIFY_DISABLE_OUTDATED_WARNING=1 + +CMD npm run start:prod --silent diff --git a/packages/actor-scraper/sitemap-scraper/INPUT_SCHEMA.json b/packages/actor-scraper/sitemap-scraper/INPUT_SCHEMA.json new file mode 100644 index 00000000..99b80db3 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/INPUT_SCHEMA.json @@ -0,0 +1,64 @@ +{ + "title": "Sitemap Scraper Input", + "type": "object", + "description": "", + "schemaVersion": 1, + "properties": { + "startUrls": { + "sectionCaption": "Basic configuration", + "title": "Start URLs", + "type": "array", + "description": "A static list of domains to scrape.
For details, see the Start URLs section in the README.", + "prefill": [ + { + "url": "https://docs.apify.com/sitemap.xml" + } + ], + "editor": "requestListSources" + }, + "proxyConfiguration": { + "sectionCaption": "Proxy and HTTP configuration", + "title": "Proxy configuration", + "type": "object", + "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.
For details, see the Proxy configuration section in the README.", + "prefill": { + "useApifyProxy": true + }, + "default": { + "useApifyProxy": true + }, + "editor": "proxy" + }, + "proxyRotation": { + "title": "Proxy rotation", + "type": "string", + "description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.", + "default": "RECOMMENDED", + "editor": "select", + "enum": ["RECOMMENDED", "PER_REQUEST", "UNTIL_FAILURE"], + "enumTitles": [ + "Use recommended settings", + "Rotate proxy after each request", + "Use one proxy until failure" + ] + }, + "maxCrawlingDepth": { + "title": "Max crawling depth", + "editor": "hidden", + "type": "integer", + "description": "Specifies how many sitemap levels the crawler will descend. This value is a safeguard against infinite crawling depths for misconfigured scrapers.
If set to 1, only the root sitemap will be processed. If set to 0, there is no limit.", + "minimum": 0, + "default": 0 + }, + "maxRequestRetries": { + "title": "Max request retries", + "type": "integer", + "editor": "hidden", + "description": "The maximum number of times the scraper will retry to load each web page in case of a page load error or an exception thrown while processing the page.
If set to 0, the page will be considered failed right after the first error.", + "minimum": 0, + "default": 3, + "unit": "retries" + } + }, + "required": ["startUrls", "proxyConfiguration"] +} diff --git a/packages/actor-scraper/sitemap-scraper/package.json b/packages/actor-scraper/sitemap-scraper/package.json new file mode 100644 index 00000000..ae45a93e --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/package.json @@ -0,0 +1,41 @@ +{ + "name": "@apify/actor-sitemap-scraper", + "version": "0.0.1", + "private": true, + "description": "Crawl web pages from a sitemap using HTTP HEAD requests", + "type": "module", + "dependencies": { + "@apify/scraper-tools": "^1.1.4", + "@crawlee/http": "^3.14.1", + "@crawlee/utils": "^3.15.3", + "apify": "^3.2.6", + "impit": "^0.7.5" + }, + "devDependencies": { + "@apify/tsconfig": "^0.1.0", + "@types/node": "^24.0.0", + "tsx": "^4.19.1", + "typescript": "~5.9.0" + }, + "scripts": { + "start": "npm run start:dev", + "start:prod": "node dist/main.js", + "start:dev": "tsx src/main.ts", + "build": "tsc" + }, + "repository": { + "type": "git", + "url": "https://github.com/apify/actor-scraper" + }, + "author": { + "name": "Apify Technologies", + "email": "support@apify.com", + "url": "https://apify.com" + }, + "contributors": [ + "Marek Trunkat ", + "Ondra Urban " + ], + "license": "Apache-2.0", + "homepage": "https://github.com/apify/actor-scraper" +} diff --git a/packages/actor-scraper/sitemap-scraper/src/internals/consts.ts b/packages/actor-scraper/sitemap-scraper/src/internals/consts.ts new file mode 100644 index 00000000..889e21fa --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/src/internals/consts.ts @@ -0,0 +1,28 @@ +import type { + Dictionary, + ProxyConfigurationOptions, + RequestOptions, +} from '@crawlee/http'; + +export const enum ProxyRotation { + Recommended = 'RECOMMENDED', + PerRequest = 'PER_REQUEST', + UntilFailure = 'UNTIL_FAILURE', +} + +/** + * Replicates the INPUT_SCHEMA with JavaScript types for quick reference + * and IDE type check integration. 
+ */ +export interface Input { + startUrls: RequestOptions[]; + keepUrlFragments: boolean; + respectRobotsTxtFile: boolean; + pageFunction: string; + proxyConfiguration: ProxyConfigurationOptions; + proxyRotation: ProxyRotation; + maxRequestRetries: number; + maxCrawlingDepth: number; + debugLog: boolean; + customData: Dictionary; +} diff --git a/packages/actor-scraper/sitemap-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/sitemap-scraper/src/internals/crawler_setup.ts new file mode 100644 index 00000000..b9f0221b --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/src/internals/crawler_setup.ts @@ -0,0 +1,454 @@ +import { readFile } from 'node:fs/promises'; +import type { IncomingMessage } from 'node:http'; +import { URL } from 'node:url'; + +import type { + Dictionary, + HttpCrawlerOptions, + HttpCrawlingContext, + InternalHttpCrawlingContext, + ProxyConfiguration, + Request, + RequestOptions, +} from '@crawlee/http'; +import { + createHttpRouter, + Dataset, + HttpCrawler, + KeyValueStore, + log, + RequestList, + RequestQueueV2, +} from '@crawlee/http'; +import type { ApifyEnv } from 'apify'; +import { Actor } from 'apify'; + +import type { + CrawlerSetupOptions, + RequestMetadata, +} from '@apify/scraper-tools'; +import { + constants as scraperToolsConstants, + tools, +} from '@apify/scraper-tools'; + +import type { Input } from './consts.js'; +import { ProxyRotation } from './consts.js'; +import { parseSitemap } from '@crawlee/utils'; +import { discoverValidSitemaps } from './tools.js'; + +const { META_KEY } = scraperToolsConstants; + +const { SESSION_MAX_USAGE_COUNTS } = scraperToolsConstants; +const SCHEMA = JSON.parse( + await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'), +); + +const REQUESTS_BATCH_SIZE = 25; + +const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9; +const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED'; + +/** + * Holds all the information necessary for constructing a crawler + * instance and creating a context for a pageFunction invocation. + */ +export class CrawlerSetup implements CrawlerSetupOptions { + name = 'Sitemap Scraper'; + rawInput: string; + env: ApifyEnv; + /** + * Used to store data that persist navigations + */ + globalStore = new Map(); + requestQueue: RequestQueueV2; + keyValueStore: KeyValueStore; + customData: unknown; + input: Input; + maxSessionUsageCount: number; + crawler!: HttpCrawler; + dataset!: Dataset; + pagesOutputted!: number; + proxyConfiguration?: ProxyConfiguration; + private initPromise: Promise; + protected readonly schema: object = SCHEMA; + + constructor(input: Input) { + // Set log level early to prevent missed messages. + if (input.debugLog) log.setLevel(log.LEVELS.DEBUG); + + // Keep this as string to be immutable. + this.rawInput = JSON.stringify(input); + + // Validate INPUT if not running on Apify Cloud Platform. + if (!Actor.isAtHome()) tools.checkInputOrThrow(input, this.schema); + + this.input = input; + this.env = Actor.getEnv(); + + // solving proxy rotation settings + this.maxSessionUsageCount = + SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation]; + + // Initialize async operations. 
+ this.crawler = null!; + this.requestQueue = null!; + this.dataset = null!; + this.keyValueStore = null!; + this.proxyConfiguration = null!; + this.initPromise = this._initializeAsync(); + } + + private readonly PAGE_LABEL = 'PAGE'; + + private _createRequestHandler() { + const router = createHttpRouter(); + router.addHandler(this.PAGE_LABEL, this._handlePageRequest.bind(this)); + router.addDefaultHandler(this._handleSitemapRequest.bind(this)); + return router; + } + + private async _initializeAsync() { + const discoveredSitemaps = await discoverValidSitemaps( + this.input.startUrls + .map((x) => x.url) + .filter((x) => x !== undefined), + this.proxyConfiguration, + ); + if (discoveredSitemaps.size === 0) { + throw await Actor.fail( + 'No valid sitemaps were discovered from the provided startUrls.', + ); + } + + // RequestList + const startRequest: RequestOptions[] = [...discoveredSitemaps].map( + (sitemapUrl) => ({ + url: sitemapUrl, + useExtendedUniqueKey: true, + keepUrlFragment: this.input.keepUrlFragments, + // sitemaps are fetched inside the handler + skipNavigation: true, + }), + ); + + // KeyValueStore + this.keyValueStore = await KeyValueStore.open(); + + // RequestQueue + this.requestQueue = await RequestQueueV2.open(); + + if ( + !(await this.keyValueStore.recordExists( + REQUEST_QUEUE_INIT_FLAG_KEY, + )) + ) { + const requests: Request[] = []; + for await (const request of await RequestList.open( + null, + startRequest, + )) { + requests.push(request); + } + + const { waitForAllRequestsToBeAdded } = + await this.requestQueue.addRequestsBatched(requests); + + void waitForAllRequestsToBeAdded.then(async () => { + await this.keyValueStore.setValue( + REQUEST_QUEUE_INIT_FLAG_KEY, + '1', + ); + }); + } + + // Dataset + this.dataset = await Dataset.open(); + const info = await this.dataset.getInfo(); + this.pagesOutputted = info?.itemCount ?? 0; + + // Proxy configuration + this.proxyConfiguration = (await Actor.createProxyConfiguration( + this.input.proxyConfiguration, + )) as any as ProxyConfiguration; + } + + /** + * Resolves to a `HttpCrawler` instance. + */ + async createCrawler() { + await this.initPromise; + + const options: HttpCrawlerOptions = { + proxyConfiguration: this.proxyConfiguration, + requestHandler: this._createRequestHandler(), + preNavigationHooks: [], + postNavigationHooks: [], + requestQueue: this.requestQueue, + failedRequestHandler: this._failedRequestHandler.bind(this), + respectRobotsTxtFile: this.input.respectRobotsTxtFile, + maxRequestRetries: this.input.maxRequestRetries, + autoscaledPoolOptions: { + systemStatusOptions: { + maxEventLoopOverloadedRatio: + MAX_EVENT_LOOP_OVERLOADED_RATIO, + }, + }, + // this scraper just outputs the returned status code, so we don't treat any as an error + ignoreHttpErrorStatusCodes: Array.from( + { length: 100 }, + (_, i) => 500 + i, + ), + useSessionPool: true, + persistCookiesPerSession: true, + sessionPoolOptions: { + blockedStatusCodes: [], + sessionOptions: { + maxUsageCount: this.maxSessionUsageCount, + }, + }, + experiments: { + requestLocking: true, + }, + }; + + this._createNavigationHooks(options); + + if (this.input.proxyRotation === ProxyRotation.UntilFailure) { + options.sessionPoolOptions!.maxPoolSize = 1; + } + + this.crawler = new HttpCrawler(options); + + return this.crawler; + } + + private _createNavigationHooks(options: HttpCrawlerOptions) { + options.preNavigationHooks!.push(async ({ request }) => { + // Normalize headers + request.headers = Object.entries(request.headers ?? 
{}).reduce( + (newHeaders, [key, value]) => { + newHeaders[key.toLowerCase()] = value; + return newHeaders; + }, + {} as Dictionary, + ); + }); + } + + private async _failedRequestHandler({ request }: HttpCrawlingContext) { + const lastError = + request.errorMessages[request.errorMessages.length - 1]; + const errorMessage = lastError ? lastError.split('\n')[0] : 'no error'; + log.error( + `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`, + ); + return this._handleResult(request, undefined, undefined, true); + } + + /** + * Parses the sitemap if it's one and enqueues HEAD requests. Otherwise pushes + * the response data to the dataset. + */ + protected async _handleSitemapRequest( + crawlingContext: HttpCrawlingContext, + ) { + const { request } = crawlingContext; + + // Make sure that an object containing internal metadata + // is present on every request. + tools.ensureMetaData(request); + + log.info('Processing sitemap', { url: request.url }); + const parsed = parseSitemap( + [{ type: 'url', url: request.url }], + await this.proxyConfiguration?.newUrl(), + { + emitNestedSitemaps: true, + maxDepth: 0, + }, + ); + + const nestedSitemaps: string[] = []; + const urls: string[] = []; + let scrapedAnyPageUrls = false; + let scrapedAnySitemapUrls = false; + + const flushUrls = async () => { + if (urls.length === 0) return; + await this._enqueuePageRequests(urls, crawlingContext); + urls.length = 0; + }; + + const flushSitemaps = async () => { + if (nestedSitemaps.length === 0) return; + await this._enqueueSitemapRequests(nestedSitemaps, crawlingContext); + nestedSitemaps.length = 0; + }; + for await (const item of parsed) { + if (!item.originSitemapUrl) { + log.debug('Handling nested sitemap', { + url: item.loc, + }); + + nestedSitemaps.push(item.loc); + scrapedAnySitemapUrls = true; + } else { + log.debug('Handling url from sitemap', { + url: item.loc, + }); + + urls.push(item.loc); + scrapedAnyPageUrls = true; + } + + if (nestedSitemaps.length >= REQUESTS_BATCH_SIZE) { + await flushSitemaps(); + } + + if (urls.length >= REQUESTS_BATCH_SIZE) { + await flushUrls(); + } + } + + await flushSitemaps(); + await flushUrls(); + + const { hasReachedMaxDepth, currentDepth } = + this._hasSitemapReachedMaxDepth(request); + if ( + hasReachedMaxDepth && + !scrapedAnyPageUrls && + scrapedAnySitemapUrls + ) { + log.warning( + "Reached max depth limit at a sitemap containing only sitemaps. Increase your `maxCrawlingDepth` if this wasn't intended", + { + sitemapUrl: request.url, + currentDepth, + }, + ); + } + } + + protected async _handlePageRequest(crawlingContext: HttpCrawlingContext) { + const { request, response } = crawlingContext; + + // Make sure that an object containing internal metadata + // is present on every request. + tools.ensureMetaData(request); + + const result = { + url: request.url, + status: response.statusCode, + }; + + // Save the `pageFunction`s result to the default dataset. 
+ await this._handleResult(request, response, result); + } + + private async _handleResult( + request: Request, + response?: IncomingMessage, + pageFunctionResult?: Dictionary, + isError?: boolean, + ) { + const payload = tools.createDatasetPayload( + request, + response, + pageFunctionResult, + isError, + ); + await this.dataset.pushData(payload); + + if (this.pagesOutputted > 0 && this.pagesOutputted % 100 === 0) { + log.info( + `Pushed ${this.pagesOutputted} items to the dataset so far.`, + ); + } + this.pagesOutputted++; + } + + private _hasSitemapReachedMaxDepth(request: Request): { + hasReachedMaxDepth: boolean; + currentDepth: number; + } { + /** + * The depth of the parent sitemap + */ + const currentDepth = (request.userData![META_KEY] as RequestMetadata) + .depth; + const hasReachedMaxDepth = + this.input.maxCrawlingDepth && + currentDepth + 1 >= this.input.maxCrawlingDepth; + return { + hasReachedMaxDepth: Boolean(hasReachedMaxDepth), + currentDepth, + }; + } + + private async _enqueueSitemapRequests( + urls: string[], + { request, enqueueLinks }: HttpCrawlingContext, + ): Promise<{ + reachedMaxDepth: boolean; + }> { + const { hasReachedMaxDepth, currentDepth } = + this._hasSitemapReachedMaxDepth(request); + if (hasReachedMaxDepth) { + log.debug( + `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`, + ); + return { + reachedMaxDepth: true, + }; + } + + await enqueueLinks({ + urls, + transformRequestFunction: (requestOptions) => { + requestOptions.userData ??= {}; + requestOptions.userData[META_KEY] = { + parentRequestId: request.id || request.uniqueKey, + depth: currentDepth + 1, + }; + + requestOptions.useExtendedUniqueKey = true; + requestOptions.keepUrlFragment = this.input.keepUrlFragments; + return requestOptions; + }, + }); + + return { + reachedMaxDepth: false, + }; + } + + private async _enqueuePageRequests( + urls: string[], + { request, enqueueLinks }: HttpCrawlingContext, + ) { + const currentDepth = (request.userData![META_KEY] as RequestMetadata) + .depth; + + // NOTE: depth check when enqueueing pages is not needed, since the one + // for sitemaps will do the job + + await enqueueLinks({ + urls, + label: this.PAGE_LABEL, + transformRequestFunction: (requestOptions) => { + requestOptions.userData ??= {}; + requestOptions.userData[META_KEY] = { + parentRequestId: request.id || request.uniqueKey, + depth: currentDepth + 1, + }; + + requestOptions.useExtendedUniqueKey = true; + requestOptions.keepUrlFragment = this.input.keepUrlFragments; + requestOptions.method = 'HEAD'; + return requestOptions; + }, + }); + } +} diff --git a/packages/actor-scraper/sitemap-scraper/src/internals/index.ts b/packages/actor-scraper/sitemap-scraper/src/internals/index.ts new file mode 100644 index 00000000..1fa9bbd8 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/src/internals/index.ts @@ -0,0 +1,2 @@ +export type { Input } from './consts.js'; +export { CrawlerSetup } from './crawler_setup.js'; diff --git a/packages/actor-scraper/sitemap-scraper/src/internals/tools.ts b/packages/actor-scraper/sitemap-scraper/src/internals/tools.ts new file mode 100644 index 00000000..b927acee --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/src/internals/tools.ts @@ -0,0 +1,127 @@ +import type { ProxyConfiguration } from '@crawlee/http'; +import { log } from '@crawlee/core'; +import { RobotsFile } from '@crawlee/utils'; +import { Impit } from 'impit'; + +export const SITEMAP_REQUEST_TIMEOUT_MILLIS = 30e3; + +/** + * Given a list of URLs, discover related 
sitemap files for these domains by checking the `robots.txt` file, + * the default `sitemap.xml` file and the URLs themselves. + * @param urls The list of URLs to discover sitemaps for. + * @param proxy The proxy configuration instance to use for the request making. + * @returns An `Set` with the discovered sitemap URLs. + */ +export async function discoverValidSitemaps( + urls: string[], + proxy?: ProxyConfiguration, +): Promise> { + log.info('Discovering possible sitemap files from the start URLs...'); + + const sitemapUrls = new Set(); + + const addSitemapUrl = (url: string) => { + const sizeBefore = sitemapUrls.size; + + sitemapUrls.add(url); + + if (sitemapUrls.size > sizeBefore) { + log.info(`Found sitemap url '${url}'`); + } + }; + + const proxyUrl = await proxy?.newUrl(); + + const discoverSitemapsForDomainUrls = async ( + hostname: string, + domainUrls: string[], + ) => { + if (!hostname) { + return; + } + + log.info(`Discovering possible sitemap files for '${hostname}'...`); + + try { + const robotsFile = await RobotsFile.find(domainUrls[0], proxyUrl); + + for (const sitemapUrl of robotsFile.getSitemaps()) { + addSitemapUrl(sitemapUrl); + } + } catch (err) { + log.warning(`Failed to fetch robots.txt file for ${hostname}`, { + error: err, + }); + } + + const sitemapUrl = domainUrls.find((url) => + /sitemap\.(?:xml|txt)(?:\.gz)?$/i.test(url), + ); + + if (sitemapUrl !== undefined) { + addSitemapUrl(sitemapUrl); + } else { + const firstUrl = new URL(domainUrls[0]); + firstUrl.pathname = '/sitemap.xml'; + if (await urlExists(firstUrl.toString(), proxyUrl)) { + addSitemapUrl(firstUrl.toString()); + } + + firstUrl.pathname = '/sitemap.txt'; + if (await urlExists(firstUrl.toString(), proxyUrl)) { + addSitemapUrl(firstUrl.toString()); + } + } + }; + + await Promise.all( + Object.entries( + Object.groupBy(urls, (url) => + URL.canParse(url) ? new URL(url).hostname : '', + ), + ).map(async ([hostname, domainUrls]) => + discoverSitemapsForDomainUrls(hostname, domainUrls ?? []), + ), + ); + + if (sitemapUrls.size > 0) { + log.info( + `Sitemap discovery finished, found ${sitemapUrls.size} sitemap URLs`, + ); + } else { + log.warning( + 'Sitemap discovery finished, no sitemaps were found for the provided start URLs.', + ); + } + + return sitemapUrls; +} + +/** + * Check if a document with the given URL exists by making a `HEAD` request to it. + * @param url The URL to check. + * @param proxyUrl The proxy URL to use for the request. + * @returns A `Promise` that resolves to `true` if the URL exists, `false` otherwise. 
+ */ +export async function urlExists( + url: string, + proxyUrl?: string, +): Promise { + try { + const response = await new Impit({ + browser: 'firefox', + proxyUrl, + ignoreTlsErrors: true, + }).fetch(url, { + method: 'HEAD', + }); + + if (response.status < 200 || response.status >= 400) { + return false; + } + + return true; + } catch { + return false; + } +} diff --git a/packages/actor-scraper/sitemap-scraper/src/main.ts b/packages/actor-scraper/sitemap-scraper/src/main.ts new file mode 100644 index 00000000..0d3d9d20 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/src/main.ts @@ -0,0 +1,5 @@ +import { runActor } from '@apify/scraper-tools'; + +import { CrawlerSetup } from './internals/crawler_setup.js'; + +runActor(CrawlerSetup); diff --git a/packages/actor-scraper/sitemap-scraper/tsconfig.json b/packages/actor-scraper/sitemap-scraper/tsconfig.json new file mode 100644 index 00000000..bf3a4188 --- /dev/null +++ b/packages/actor-scraper/sitemap-scraper/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "@apify/tsconfig", + "compilerOptions": { + "outDir": "dist", + "module": "ESNext", + "allowJs": true, + "skipLibCheck": true + }, + "include": ["src"] +}