5 changes: 4 additions & 1 deletion .gitignore
@@ -1,5 +1,8 @@
build
.idea
storage
apify_storage
dist
build
build-docs
node_modules
*.log
92 changes: 57 additions & 35 deletions package-lock.json

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions packages/actor-scraper/sitemap-scraper/.actor/actor.json
@@ -0,0 +1,13 @@
{
    "actorSpecification": 1,
    "name": "sitemap-scraper",
    "version": "0.1",
    "buildTag": "latest",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "fields": {},
            "views": {}
        }
    }
}
10 changes: 10 additions & 0 deletions packages/actor-scraper/sitemap-scraper/.dockerignore
@@ -0,0 +1,10 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules
32 changes: 32 additions & 0 deletions packages/actor-scraper/sitemap-scraper/Dockerfile
@@ -0,0 +1,32 @@
FROM apify/actor-node:22 AS builder

COPY package*.json ./

RUN npm install --include=dev --audit=false

COPY . ./

RUN npm run build

FROM apify/actor-node:22

COPY --from=builder /usr/src/app/dist ./dist

COPY package*.json ./

RUN rm -rf node_modules \
    && npm --quiet set progress=false \
    && npm install --omit=dev \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

COPY . ./

ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD npm run start:prod --silent
64 changes: 64 additions & 0 deletions packages/actor-scraper/sitemap-scraper/INPUT_SCHEMA.json
@@ -0,0 +1,64 @@
{
    "title": "Sitemap Scraper Input",
    "type": "object",
    "description": "",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "sectionCaption": "Basic configuration",
            "title": "Start URLs",
            "type": "array",
            "description": "A static list of sitemap URLs to scrape. <br><br>For details, see the <a href='https://apify.com/apify/sitemap-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> section in the README.",
            "prefill": [
                {
                    "url": "https://docs.apify.com/sitemap.xml"
                }
            ],
            "editor": "requestListSources"
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/apify/sitemap-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
            "prefill": {
                "useApifyProxy": true
            },
            "default": {
                "useApifyProxy": true
            },
            "editor": "proxy"
        },
        "proxyRotation": {
            "title": "Proxy rotation",
            "type": "string",
            "description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.",
            "default": "RECOMMENDED",
            "editor": "select",
            "enum": ["RECOMMENDED", "PER_REQUEST", "UNTIL_FAILURE"],
            "enumTitles": [
                "Use recommended settings",
                "Rotate proxy after each request",
                "Use one proxy until failure"
            ]
        },
        "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "editor": "hidden",
            "type": "integer",
            "description": "Specifies how many sitemap levels the crawler will descend. This value is a safeguard against infinite crawling depths for misconfigured scrapers. <br><br>If set to <code>1</code>, the crawler will process only the root sitemap. If set to <code>0</code>, there is no limit.",
            "minimum": 0,
            "default": 0
        },
        "maxRequestRetries": {
            "title": "Max request retries",
            "type": "integer",
            "editor": "hidden",
            "description": "The maximum number of times the scraper will retry loading each web page, in case of a page load error or an exception thrown by the <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
            "minimum": 0,
            "default": 3,
            "unit": "retries"
        }
    },
    "required": ["startUrls", "proxyConfiguration"]
}
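
Note: the schema above only declares the inputs; the code that consumes them (src/main.ts) is not part of this diff. As a rough, illustrative sketch only, reading and validating this input with the Apify SDK could look like the following (the wiring, error message, and variable names are assumptions, not the actor's actual code):

import { Actor } from 'apify';
import type { Input } from './internals/consts.js';

await Actor.init();

// getInput() returns the run input validated against INPUT_SCHEMA.json;
// schema defaults (e.g. proxyRotation: "RECOMMENDED") are already applied.
const input = (await Actor.getInput<Input>())!;
if (!input.startUrls?.length) throw new Error('At least one start URL is required.');

// Turn the "proxyConfiguration" input object into a usable ProxyConfiguration instance.
const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);

await Actor.exit();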
41 changes: 41 additions & 0 deletions packages/actor-scraper/sitemap-scraper/package.json
@@ -0,0 +1,41 @@
{
    "name": "@apify/actor-sitemap-scraper",
    "version": "0.0.1",
    "private": true,
    "description": "Crawl web pages from a sitemap using HTTP HEAD requests",
    "type": "module",
    "dependencies": {
        "@apify/scraper-tools": "^1.1.4",
        "@crawlee/http": "^3.14.1",
        "@crawlee/utils": "^3.15.3",
        "apify": "^3.2.6",
        "impit": "^0.7.5"
    },
    "devDependencies": {
        "@apify/tsconfig": "^0.1.0",
        "@types/node": "^24.0.0",
        "tsx": "^4.19.1",
        "typescript": "~5.9.0"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc"
    },
    "repository": {
        "type": "git",
        "url": "https://github.com/apify/actor-scraper"
    },
    "author": {
        "name": "Apify Technologies",
        "email": "[email protected]",
        "url": "https://apify.com"
    },
    "contributors": [
        "Marek Trunkat <[email protected]>",
        "Ondra Urban <[email protected]>"
    ],
    "license": "Apache-2.0",
    "homepage": "https://github.com/apify/actor-scraper"
}
28 changes: 28 additions & 0 deletions packages/actor-scraper/sitemap-scraper/src/internals/consts.ts
@@ -0,0 +1,28 @@
import type {
    Dictionary,
    ProxyConfigurationOptions,
    RequestOptions,
} from '@crawlee/http';

export const enum ProxyRotation {
    Recommended = 'RECOMMENDED',
    PerRequest = 'PER_REQUEST',
    UntilFailure = 'UNTIL_FAILURE',
}

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 */
export interface Input {
    startUrls: RequestOptions[];
    keepUrlFragments: boolean;
    respectRobotsTxtFile: boolean;
    pageFunction: string;
    proxyConfiguration: ProxyConfigurationOptions;
    proxyRotation: ProxyRotation;
    maxRequestRetries: number;
    maxCrawlingDepth: number;
    debugLog: boolean;
    customData: Dictionary;
}
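
Note: nothing in this diff shows the crawler itself, but the package description ("Crawl web pages from a sitemap using HTTP HEAD requests") together with the @crawlee/http and @crawlee/utils dependencies suggests a shape roughly like the sketch below. It is illustrative only; the Sitemap.load call, the handler body, and the dataset fields are assumptions about the missing main.ts, not its actual contents:

import { HttpCrawler } from '@crawlee/http';
import { Sitemap } from '@crawlee/utils';

// Expand the root sitemap into the list of page URLs it references.
const { urls } = await Sitemap.load('https://docs.apify.com/sitemap.xml');

const crawler = new HttpCrawler({
    maxRequestRetries: 3,
    async requestHandler({ request, response, pushData }) {
        // A HEAD response carries headers only, so each page costs one
        // round trip with no body transfer.
        await pushData({ url: request.url, statusCode: response.statusCode });
    },
});

// Probe every page with a HEAD request instead of a full GET.
await crawler.run(urls.map((url) => ({ url, method: 'HEAD' as const })));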