Initial commit
This commit is contained in:
8
.dockerignore
Normal file
8
.dockerignore
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# configurations
|
||||||
|
.idea
|
||||||
|
|
||||||
|
# crawlee storage folder
|
||||||
|
storage
|
||||||
|
|
||||||
|
# installed files
|
||||||
|
node_modules
|
||||||
5
.env
Normal file
5
.env
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
POSTGRES_HOST=172.17.0.1
|
||||||
|
POSTGRES_PORT=5432
|
||||||
|
POSTGRES_DB=fundamnit
|
||||||
|
POSTGRES_USER=fundamnit
|
||||||
|
POSTGRES_PASSWORD=fundamnit
|
||||||
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# This file tells Git which files shouldn't be added to source control
|
||||||
|
|
||||||
|
.idea
|
||||||
|
dist
|
||||||
|
node_modules
|
||||||
|
storage
|
||||||
51
Dockerfile
Normal file
51
Dockerfile
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# Specify the base Docker image. You can read more about
|
||||||
|
# the available images at https://crawlee.dev/docs/guides/docker-images
|
||||||
|
# You can also use any other image from Docker Hub.
|
||||||
|
FROM apify/actor-node-playwright-chrome:20 AS builder
|
||||||
|
|
||||||
|
# Copy just package.json and package-lock.json
|
||||||
|
# to speed up the build using Docker layer cache.
|
||||||
|
COPY --chown=myuser package*.json ./
|
||||||
|
|
||||||
|
# Install all dependencies. Don't audit to speed up the installation.
|
||||||
|
RUN npm install --include=dev --audit=false
|
||||||
|
|
||||||
|
# Next, copy the source files using the user set
|
||||||
|
# in the base image.
|
||||||
|
COPY --chown=myuser . ./
|
||||||
|
|
||||||
|
# Build the project (runs the `build` script from package.json).
|
||||||
|
# Dependencies were already installed in the step above, so nothing is installed here.
|
||||||
|
RUN npm run build
|
||||||
|
|
||||||
|
# Create final image
|
||||||
|
FROM apify/actor-node-playwright-chrome:20
|
||||||
|
|
||||||
|
# Copy only built JS files from builder image
|
||||||
|
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
|
||||||
|
|
||||||
|
# Copy just package.json and package-lock.json
|
||||||
|
# to speed up the build using Docker layer cache.
|
||||||
|
COPY --chown=myuser package*.json ./
|
||||||
|
|
||||||
|
# Install NPM packages, skip optional and development dependencies to
|
||||||
|
# keep the image small. Avoid logging too much and print the dependency
|
||||||
|
# tree for debugging
|
||||||
|
RUN npm --quiet set progress=false \
|
||||||
|
&& npm install --omit=dev --omit=optional \
|
||||||
|
&& echo "Installed NPM packages:" \
|
||||||
|
&& (npm list --omit=dev --all || true) \
|
||||||
|
&& echo "Node.js version:" \
|
||||||
|
&& node --version \
|
||||||
|
&& echo "NPM version:" \
|
||||||
|
&& npm --version
|
||||||
|
|
||||||
|
# Next, copy the remaining files and directories with the source code.
|
||||||
|
# Since we do this after NPM install, quick build will be really fast
|
||||||
|
# for most source file changes.
|
||||||
|
COPY --chown=myuser . ./
|
||||||
|
|
||||||
|
|
||||||
|
# Run the image. If you know you won't need headful browsers,
|
||||||
|
# you can remove the XVFB start script for a micro perf gain.
|
||||||
|
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
|
||||||
17
Justfile
Normal file
17
Justfile
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
set dotenv-load
|
||||||
|
|
||||||
|
run_sling:
|
||||||
|
docker run -it --rm \
|
||||||
|
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||||
|
-v ${PWD}/storage:/storage \
|
||||||
|
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||||
|
-e SLING_DISABLE_TELEMETRY=true \
|
||||||
|
docker.io/slingdata/sling run -r /replication.yaml
|
||||||
|
|
||||||
|
run:
|
||||||
|
docker run -it --rm \
|
||||||
|
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||||
|
-v ${PWD}/storage:/storage \
|
||||||
|
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||||
|
--entrypoint bash \
|
||||||
|
docker.io/slingdata/sling:v1.4.24
|
||||||
9
README.md
Normal file
9
README.md
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Getting started with Crawlee
|
||||||
|
|
||||||
|
This project uses `PlaywrightCrawler` to crawl new-construction project pages on https://www.funda.nl using the browser automation library [Playwright](https://playwright.dev), launched through the Chromium launcher from `patchright`.
|
||||||
|
|
||||||
|
You can find more examples and documentation at the following links:
|
||||||
|
|
||||||
|
- [Step-by-step tutorial](https://crawlee.dev/js/docs/introduction) for Crawlee
|
||||||
|
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/js/api/playwright-crawler/class/PlaywrightCrawler)
|
||||||
|
- Other [examples](https://crawlee.dev/js/docs/examples/playwright-crawler)
|
||||||
9
docker-compose.yaml
Normal file
9
docker-compose.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: docker.io/library/postgres:18
|
||||||
|
ports:
|
||||||
|
- ${POSTGRES_PORT}:5432
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: ${POSTGRES_USER}
|
||||||
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
|
POSTGRES_DB: ${POSTGRES_DB}
|
||||||
4175
package-lock.json
generated
Normal file
4175
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
27
package.json
Normal file
27
package.json
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"name": "fundamnit",
|
||||||
|
"version": "0.0.1",
|
||||||
|
"type": "module",
|
||||||
|
"description": "This is an example of a Crawlee project.",
|
||||||
|
"dependencies": {
|
||||||
|
"crawlee": "^3.0.0",
|
||||||
|
"patchright": "^1.56.1",
|
||||||
|
"playwright": "*",
|
||||||
|
"zod": "^4.1.12"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@apify/tsconfig": "^0.1.0",
|
||||||
|
"tsx": "^4.4.0",
|
||||||
|
"typescript": "~5.9.0",
|
||||||
|
"@types/node": "^22.0.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"start": "npm run start:dev",
|
||||||
|
"start:prod": "node dist/main.js",
|
||||||
|
"start:dev": "tsx src/main.ts",
|
||||||
|
"build": "tsc",
|
||||||
|
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "It's not you it's me",
|
||||||
|
"license": "ISC"
|
||||||
|
}
|
||||||
11
replication.yaml
Normal file
11
replication.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
source: LOCAL
|
||||||
|
target: POSTGRES
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
mode: full-refresh
|
||||||
|
object: funda.projects
|
||||||
|
source_options:
|
||||||
|
format: json
|
||||||
|
|
||||||
|
streams:
|
||||||
|
"file:///storage/datasets/projects/":
|
||||||
58
src/main.ts
Normal file
58
src/main.ts
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
import { PlaywrightCrawler } from 'crawlee';
|
||||||
|
import { chromium, Locator, Page } from 'patchright';
|
||||||
|
import * as z from 'zod';
|
||||||
|
|
||||||
|
// Matches URLs of Funda new-construction ("nieuwbouw") project detail pages.
// Used both to decide which visited pages get scraped and which links get enqueued.
// The pattern is not anchored, so it matches when this prefix appears anywhere in the tested string.
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/;
|
||||||
|
|
||||||
|
/**
 * Tags of Nuxt payload wrappers serialized as `[tag, index]`, where the real
 * value lives at `index`. They only carry Vue reactivity information, which is
 * irrelevant for scraping, so they are unwrapped transparently.
 */
const NUXT_WRAPPER_TAGS: ReadonlySet<string> = new Set([
    'ShallowReactive',
    'Reactive',
    'ShallowRef',
    'Ref',
]);

/**
 * Resolves one entry of a flattened Nuxt payload (the JSON array read from the
 * `#__NUXT_DATA__` element) into a plain JavaScript value.
 *
 * In this format containers do not embed their children: an array entry holds
 * the indexes of its elements, and an object entry maps each key to the index
 * of its value. Resolution therefore recurses through `data` by index.
 *
 * Cyclic references are not detected; a payload that refers back to one of its
 * own ancestors will recurse until the stack overflows.
 *
 * @param data - The complete flattened payload array.
 * @param offset - Index of the entry to resolve. Defaults to the first entry.
 * @returns The resolved primitive, `null`, array or plain object. An index that
 *   has no entry in `data` resolves to `undefined`. The return type is `any`
 *   because the shape of the tree is only known to the caller.
 * @throws Error if the entry has a type that cannot be resolved (a function,
 *   symbol or bigint, none of which JSON can produce).
 */
function parseNuxtData(data: unknown[], offset: number = 0): any {
    const item = data[offset];

    // Primitives resolve to themselves. `null` must be handled here: it is a
    // valid JSON value, but `typeof null === 'object'` would otherwise send it
    // past every branch and into the "unsupported" error.
    if (
        item === null
        || item === undefined
        || typeof item === 'string'
        || typeof item === 'number'
        || typeof item === 'boolean'
    ) {
        return item;
    }

    if (Array.isArray(item)) {
        const [tag, target] = item;

        // Ordinary arrays contain only numeric indexes, so a known string tag
        // in first position marks a wrapper around the entry at `target`.
        if (typeof tag === 'string' && NUXT_WRAPPER_TAGS.has(tag)) {
            return parseNuxtData(data, target);
        }

        return item.map((index) => parseNuxtData(data, index));
    }

    if (typeof item === 'object') {
        // Object.fromEntries defines own properties, so a `__proto__` key coming
        // from scraped data cannot replace the prototype of the result.
        return Object.fromEntries(
            Object.entries(item).map(([key, index]) => [
                key,
                parseNuxtData(data, typeof index === 'number' ? index : Number.parseInt(String(index), 10)),
            ]),
        );
    }

    // Report the type rather than the value: interpolating a symbol would
    // itself throw a TypeError and hide this message.
    throw new Error(`Unsupported item type at index ${offset}: ${typeof item}`);
}
|
||||||
|
|
||||||
|
// Index inside the flattened `#__NUXT_DATA__` array at which resolution starts.
// NOTE(review): 4 is presumably tied to Funda's current payload layout — confirm
// against a live page if scraped records start coming out wrong.
const PROJECT_DATA_OFFSET = 4;

/**
 * Crawler that discovers Funda new-construction project pages and stores the
 * data embedded in each project page in the `projects` dataset.
 */
const crawler = new PlaywrightCrawler({
    async requestHandler({ request, page, enqueueLinks, pushData }) {
        // Only project detail pages are scraped; every other page (such as the
        // search results start page) is used purely for link discovery.
        if (PROJECT_URL.test(request.url)) {
            const rawData = await page.locator('#__NUXT_DATA__').innerText();
            const payload: unknown = JSON.parse(rawData);

            // parseNuxtData indexes into the payload, so fail with a clear
            // message if the page did not deliver the expected JSON array.
            if (!Array.isArray(payload)) {
                throw new Error(`Unexpected __NUXT_DATA__ payload on ${request.url}: expected a JSON array`);
            }

            await pushData(parseNuxtData(payload, PROJECT_DATA_OFFSET), 'projects');
        }

        // Follow links to further project pages from any page visited.
        await enqueueLinks({ regexps: [PROJECT_URL] });
    },
    maxRequestsPerCrawl: 20,
    // Headful browser; the Docker image starts XVFB before running the app.
    headless: false,
    launchContext: {
        // Chromium launcher imported from `patchright` rather than `playwright`.
        launcher: chromium,
    },
});

// Start from the Funda search results filtered to newly built project groups.
await crawler.run(['https://www.funda.nl/zoeken/koop?construction_type=[%22newly_built%22]&type=[%22group%22]']);
|
||||||
35
src/resolve.ts
Normal file
35
src/resolve.ts
Normal file
File diff suppressed because one or more lines are too long
12
tsconfig.json
Normal file
12
tsconfig.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"extends": "@apify/tsconfig",
|
||||||
|
"compilerOptions": {
|
||||||
|
"module": "NodeNext",
|
||||||
|
"moduleResolution": "NodeNext",
|
||||||
|
"target": "ES2022",
|
||||||
|
"outDir": "dist",
|
||||||
|
"noUnusedLocals": false,
|
||||||
|
"lib": ["DOM"]
|
||||||
|
},
|
||||||
|
"include": ["./src/**/*"]
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user