Initial commit

This commit is contained in:
2025-10-27 11:17:29 +01:00
commit f88d870db5
14 changed files with 4423 additions and 0 deletions

8
.dockerignore Normal file
View File

@@ -0,0 +1,8 @@
# configurations
.idea
# crawlee storage folder
storage
# installed files
node_modules

5
.env Normal file
View File

@@ -0,0 +1,5 @@
POSTGRES_HOST=172.17.0.1
POSTGRES_PORT=5432
POSTGRES_DB=fundamnit
POSTGRES_USER=fundamnit
POSTGRES_PASSWORD=fundamnit

6
.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
# This file tells Git which files shouldn't be added to source control
.idea
dist
node_modules
storage

51
Dockerfile Normal file
View File

@@ -0,0 +1,51 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false
# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./
# Build the project: compile the TypeScript sources into dist/.
# (Dependencies were already installed by the npm install step above.)
RUN npm run build
# Create final image
FROM apify/actor-node-playwright-chrome:20
# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

17
Justfile Normal file
View File

@@ -0,0 +1,17 @@
set dotenv-load

# Sling image used by every recipe. Pinned to a single tag so that the
# replication run and the interactive debug shell use the same Sling version.
sling_image := "docker.io/slingdata/sling:v1.4.24"

# Run the replication described in replication.yaml, loading the crawled
# datasets from ./storage into the Postgres instance configured via the
# POSTGRES_* variables loaded from the dotenv file.
run_sling:
    docker run -it --rm \
        -v ${PWD}/replication.yaml:/replication.yaml \
        -v ${PWD}/storage:/storage \
        -e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
        -e SLING_DISABLE_TELEMETRY=true \
        {{sling_image}} run -r /replication.yaml

# Open a bash shell inside the Sling container with the same mounts and
# Postgres connection string as run_sling (useful for debugging).
run:
    docker run -it --rm \
        -v ${PWD}/replication.yaml:/replication.yaml \
        -v ${PWD}/storage:/storage \
        -e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
        --entrypoint bash \
        {{sling_image}}

9
README.md Normal file
View File

@@ -0,0 +1,9 @@
# Getting started with Crawlee
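This project uses `PlaywrightCrawler` to crawl new-build project pages on https://www.funda.nl, starting from a search-results page and following links to `/detail/nieuwbouw/` pages. It drives the browser with the automation library [Playwright](https://playwright.dev), launching Chromium through `patchright`, and stores the scraped items in the `projects` dataset.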
You can find more examples and documentation at the following links:
- [Step-by-step tutorial](https://crawlee.dev/js/docs/introduction) for Crawlee
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/js/api/playwright-crawler/class/PlaywrightCrawler)
- Other [examples](https://crawlee.dev/js/docs/examples/playwright-crawler)

BIN
bun.lockb Executable file

Binary file not shown.

9
docker-compose.yaml Normal file
View File

@@ -0,0 +1,9 @@
services:
postgres:
image: docker.io/library/postgres:18
ports:
- ${POSTGRES_PORT}:5432
environment:
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
POSTGRES_DB: ${POSTGRES_DB}

4175
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

27
package.json Normal file
View File

@@ -0,0 +1,27 @@
{
"name": "fundamnit",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"crawlee": "^3.0.0",
"patchright": "^1.56.1",
"playwright": "*",
"zod": "^4.1.12"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"tsx": "^4.4.0",
"typescript": "~5.9.0",
"@types/node": "^22.0.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}

11
replication.yaml Normal file
View File

@@ -0,0 +1,11 @@
source: LOCAL
target: POSTGRES
defaults:
mode: full-refresh
object: funda.projects
source_options:
format: json
streams:
"file:///storage/datasets/projects/":

58
src/main.ts Normal file
View File

@@ -0,0 +1,58 @@
import { PlaywrightCrawler } from 'crawlee';
import { chromium, Locator, Page } from 'patchright';
import * as z from 'zod';
// Matches funda.nl detail pages of new-build ("nieuwbouw") projects. The request
// handler uses it both to decide which pages are scraped and to restrict which
// links are enqueued.
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/;
// Tags of wrapper entries of the form `[tag, index]` whose second element is the
// index of the wrapped value. "ShallowReactive" is the tag the original code
// handled. NOTE(review): "Reactive", "ShallowRef" and "Ref" are presumably
// encoded the same way by Nuxt — confirm against the Nuxt payload revivers.
const NUXT_WRAPPER_TAGS: ReadonlySet<string> = new Set([
    "ShallowReactive",
    "Reactive",
    "ShallowRef",
    "Ref",
]);

/**
 * Resolves one entry of a flattened Nuxt payload array into a plain value.
 *
 * Containers in the payload do not hold their children directly; they hold the
 * indices of their children within the same `data` array:
 *   - primitives (string, number, boolean, null, undefined) are returned as-is,
 *   - a wrapper entry such as `["ShallowReactive", idx]` resolves to entry `idx`,
 *   - any other array is treated as a list of element indices,
 *   - an object maps each key to the index of its value.
 *
 * An index with no entry (including negative ones) resolves to `undefined`.
 *
 * @param data   The whole parsed payload array.
 * @param offset Index of the entry to resolve; defaults to the first entry.
 * @returns The resolved value with all nested indices replaced by their values.
 * @throws Error if the entry is of a type none of the branches handle
 *         (e.g. a function, symbol or bigint).
 */
function parseNuxtData(data: unknown[], offset: number = 0): any {
    const item = data[offset];

    // `null` is a legal JSON literal; return it instead of falling through to
    // the "unsupported" error below.
    if (item === null) {
        return null;
    }
    if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined") {
        return item;
    }
    if (Array.isArray(item)) {
        // Wrapper entries carry the index of the real value in slot 1.
        if (typeof item[0] === "string" && NUXT_WRAPPER_TAGS.has(item[0])) {
            return parseNuxtData(data, item[1]);
        }
        return item.map((idx) => parseNuxtData(data, idx));
    }
    // `null` was already returned above; the check is kept for type narrowing.
    if (typeof item === "object" && item !== null) {
        const obj: Record<string, any> = {};
        for (const [key, value] of Object.entries(item)) {
            obj[key] = parseNuxtData(data, Number(value));
        }
        return obj;
    }
    throw new Error(`Unsupported item type: ${item}`);
}
// Crawler that opens pages in a headful Chromium (launched through patchright)
// and stores the Nuxt payload of every new-build project page in the
// "projects" dataset.
const crawler = new PlaywrightCrawler({
    /**
     * Handles one crawled page: scrapes it if it is a project detail page,
     * then enqueues every link on the page that points to a project detail page.
     */
    async requestHandler({ request, page, enqueueLinks, pushData }) {
        if (PROJECT_URL.test(request.url)) {
            // The page state is embedded as JSON in the #__NUXT_DATA__ element.
            const rawData = await page.locator("#__NUXT_DATA__").innerText();
            // NOTE(review): index 4 is presumably where the project record
            // starts in the payload — confirm; a layout change on the site would
            // silently yield a different object here.
            const data = parseNuxtData(JSON.parse(rawData), 4);
            await pushData(data, "projects");
        }
        await enqueueLinks({ regexps: [PROJECT_URL] });
    },
    // Hard cap on the number of requests processed in one run.
    maxRequestsPerCrawl: 20,
    // Run with a visible browser window (the Docker image starts XVFB for this).
    headless: false,
    launchContext: {
        // Use patchright's chromium launcher instead of the default one.
        launcher: chromium
    }
});

// Start from the search results for newly built, grouped (project) listings.
await crawler.run(['https://www.funda.nl/zoeken/koop?construction_type=[%22newly_built%22]&type=[%22group%22]']);

35
src/resolve.ts Normal file

File diff suppressed because one or more lines are too long

12
tsconfig.json Normal file
View File

@@ -0,0 +1,12 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": ["./src/**/*"]
}