Initial commit
This commit is contained in:
8
.dockerignore
Normal file
8
.dockerignore
Normal file
@@ -0,0 +1,8 @@
|
||||
# configurations
|
||||
.idea
|
||||
|
||||
# crawlee storage folder
|
||||
storage
|
||||
|
||||
# installed files
|
||||
node_modules
|
||||
5
.env
Normal file
5
.env
Normal file
@@ -0,0 +1,5 @@
|
||||
POSTGRES_HOST=172.17.0.1
|
||||
POSTGRES_PORT=5432
|
||||
POSTGRES_DB=fundamnit
|
||||
POSTGRES_USER=fundamnit
|
||||
POSTGRES_PASSWORD=fundamnit
|
||||
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
# This file tells Git which files shouldn't be added to source control
|
||||
|
||||
.idea
|
||||
dist
|
||||
node_modules
|
||||
storage
|
||||
51
Dockerfile
Normal file
51
Dockerfile
Normal file
@@ -0,0 +1,51 @@
|
||||
# Specify the base Docker image. You can read more about
|
||||
# the available images at https://crawlee.dev/docs/guides/docker-images
|
||||
# You can also use any other image from Docker Hub.
|
||||
FROM apify/actor-node-playwright-chrome:20 AS builder
|
||||
|
||||
# Copy just package.json and package-lock.json
|
||||
# to speed up the build using Docker layer cache.
|
||||
COPY --chown=myuser package*.json ./
|
||||
|
||||
# Install all dependencies. Don't audit to speed up the installation.
|
||||
RUN npm install --include=dev --audit=false
|
||||
|
||||
# Next, copy the source files using the user set
|
||||
# in the base image.
|
||||
COPY --chown=myuser . ./
|
||||
|
||||
# Build the project. Dependencies (including dev dependencies) were
|
||||
# already installed above, so this step only runs the build script.
|
||||
RUN npm run build
|
||||
|
||||
# Create final image
|
||||
FROM apify/actor-node-playwright-chrome:20
|
||||
|
||||
# Copy only built JS files from builder image
|
||||
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
|
||||
|
||||
# Copy just package.json and package-lock.json
|
||||
# to speed up the build using Docker layer cache.
|
||||
COPY --chown=myuser package*.json ./
|
||||
|
||||
# Install NPM packages, skip optional and development dependencies to
|
||||
# keep the image small. Avoid logging too much and print the dependency
|
||||
# tree for debugging
|
||||
RUN npm --quiet set progress=false \
|
||||
&& npm install --omit=dev --omit=optional \
|
||||
&& echo "Installed NPM packages:" \
|
||||
&& (npm list --omit=dev --all || true) \
|
||||
&& echo "Node.js version:" \
|
||||
&& node --version \
|
||||
&& echo "NPM version:" \
|
||||
&& npm --version
|
||||
|
||||
# Next, copy the remaining files and directories with the source code.
|
||||
# Since we do this after NPM install, quick build will be really fast
|
||||
# for most source file changes.
|
||||
COPY --chown=myuser . ./
|
||||
|
||||
|
||||
# Run the image. If you know you won't need headful browsers,
|
||||
# you can remove the XVFB start script for a micro perf gain.
|
||||
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
|
||||
17
Justfile
Normal file
17
Justfile
Normal file
@@ -0,0 +1,17 @@
|
||||
set dotenv-load
|
||||
|
||||
run_sling:
|
||||
docker run -it --rm \
|
||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||
-v ${PWD}/storage:/storage \
|
||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||
-e SLING_DISABLE_TELEMETRY=true \
|
||||
docker.io/slingdata/sling run -r /replication.yaml
|
||||
|
||||
run:
|
||||
docker run -it --rm \
|
||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||
-v ${PWD}/storage:/storage \
|
||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||
--entrypoint bash \
|
||||
docker.io/slingdata/sling:v1.4.24
|
||||
9
README.md
Normal file
9
README.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Getting started with Crawlee
|
||||
|
||||
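This project uses `PlaywrightCrawler` to crawl new-build project listings on https://www.funda.nl using the browser automation library [Playwright](https://playwright.dev) (launched through `patchright`'s `chromium`), and stores the extracted project data in the `projects` dataset.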
|
||||
|
||||
You can find more examples and documentation at the following links:
|
||||
|
||||
- [Step-by-step tutorial](https://crawlee.dev/js/docs/introduction) for Crawlee
|
||||
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/js/api/playwright-crawler/class/PlaywrightCrawler)
|
||||
- Other [examples](https://crawlee.dev/js/docs/examples/playwright-crawler)
|
||||
9
docker-compose.yaml
Normal file
9
docker-compose.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
services:
|
||||
postgres:
|
||||
image: docker.io/library/postgres:18
|
||||
ports:
|
||||
- ${POSTGRES_PORT}:5432
|
||||
environment:
|
||||
POSTGRES_USER: ${POSTGRES_USER}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||
POSTGRES_DB: ${POSTGRES_DB}
|
||||
4175
package-lock.json
generated
Normal file
4175
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
27
package.json
Normal file
27
package.json
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "fundamnit",
|
||||
"version": "0.0.1",
|
||||
"type": "module",
|
||||
"description": "This is an example of a Crawlee project.",
|
||||
"dependencies": {
|
||||
"crawlee": "^3.0.0",
|
||||
"patchright": "^1.56.1",
|
||||
"playwright": "*",
|
||||
"zod": "^4.1.12"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@apify/tsconfig": "^0.1.0",
|
||||
"tsx": "^4.4.0",
|
||||
"typescript": "~5.9.0",
|
||||
"@types/node": "^22.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "npm run start:dev",
|
||||
"start:prod": "node dist/main.js",
|
||||
"start:dev": "tsx src/main.ts",
|
||||
"build": "tsc",
|
||||
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
|
||||
},
|
||||
"author": "It's not you it's me",
|
||||
"license": "ISC"
|
||||
}
|
||||
11
replication.yaml
Normal file
11
replication.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
source: LOCAL
|
||||
target: POSTGRES
|
||||
|
||||
defaults:
|
||||
mode: full-refresh
|
||||
object: funda.projects
|
||||
source_options:
|
||||
format: json
|
||||
|
||||
streams:
|
||||
"file:///storage/datasets/projects/":
|
||||
58
src/main.ts
Normal file
58
src/main.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import { PlaywrightCrawler } from 'crawlee';
|
||||
import { chromium, Locator, Page } from 'patchright';
|
||||
import * as z from 'zod';
|
||||
|
||||
// Matches URLs of funda.nl new-build ("nieuwbouw") detail pages.
// The pattern is not anchored, so it matches anywhere inside the tested string.
// It is used both to decide whether the current request is a project page and
// as the link filter passed to enqueueLinks.
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/;
|
||||
|
||||
// Tags Nuxt uses to mark a reactive wrapper around a single payload entry.
// Each is serialized as `["<Tag>", <index of wrapped value>]`.
const NUXT_WRAPPER_TAGS: ReadonlySet<string> = new Set([
    "Reactive",
    "ShallowReactive",
    "Ref",
    "ShallowRef",
]);

/**
 * Resolves one entry of a flattened Nuxt payload (the JSON stored in the
 * `#__NUXT_DATA__` script tag) into a plain JavaScript value.
 *
 * In this format containers do not hold their children directly: arrays and
 * objects store *indices* into the same top-level `data` array. This function
 * follows those indices recursively, starting from `offset`.
 *
 * Resolution rules:
 * - strings, numbers, booleans, `undefined` and `null` are returned as-is;
 * - `["Reactive" | "ShallowReactive" | "Ref" | "ShallowRef", i]` resolves to
 *   whatever entry `i` resolves to (the wrapper is dropped);
 * - any other array resolves to a new array of its resolved elements;
 * - an object resolves to a new object with the same keys and resolved values.
 *
 * NOTE: cyclic references are not detected; a payload that references itself
 * recurses until the stack overflows.
 *
 * The explicit `any` return type is deliberate: a self-recursive function
 * needs an annotation under `noImplicitAny`, and the payload shape is not
 * known statically, so callers keep the same (untyped) result as before.
 *
 * @param data - The parsed payload array.
 * @param offset - Index of the entry to resolve. Defaults to 0.
 * @returns The fully resolved value for the entry at `offset`.
 * @throws Error if the entry has a type that JSON cannot produce
 *   (e.g. a function, symbol or bigint).
 */
function parseNuxtData(data: unknown[], offset: number = 0): any {
    const item = data[offset];

    // `null` is a legitimate JSON value and must be passed through. Without
    // this guard it would fall into the "unsupported" error below, because
    // `typeof null === "object"` is explicitly excluded from the object branch.
    if (item === null) {
        return null;
    }

    switch (typeof item) {
        case "string":
        case "number":
        case "boolean":
        case "undefined":
            return item;
    }

    if (Array.isArray(item)) {
        const tag: unknown = item[0];
        if (typeof tag === "string" && NUXT_WRAPPER_TAGS.has(tag)) {
            // Reactive wrapper: the second element is the index of the real value.
            return parseNuxtData(data, item[1]);
        }

        // Plain array: every element is an index into `data`.
        return item.map((idx) => parseNuxtData(data, idx));
    }

    if (typeof item === "object") {
        // Object.fromEntries defines own properties, so a "__proto__" key in
        // the payload becomes plain data instead of changing the prototype.
        return Object.fromEntries(
            Object.entries(item).map(([key, ref]) => [
                key,
                parseNuxtData(data, parseInt(ref, 10)),
            ]),
        );
    }

    // String() keeps the message safe even for values such as symbols.
    throw new Error(`Unsupported item type: ${String(item)}`);
}
|
||||
|
||||
// Crawler configuration:
// - the browser is launched through patchright's `chromium` launcher;
// - the browser runs headful (`headless: false`);
// - the crawl stops after at most 20 requests.
const crawler = new PlaywrightCrawler({
    /**
     * Handles every visited page.
     *
     * For pages whose URL matches PROJECT_URL, the JSON in the
     * `#__NUXT_DATA__` element is parsed, resolved with parseNuxtData and
     * stored in the "projects" dataset. On every page, links matching
     * PROJECT_URL are enqueued for later visits.
     */
    async requestHandler({ request, page, enqueueLinks, pushData }) {
        if (PROJECT_URL.test(request.url)) {
            const rawData = await page.locator("#__NUXT_DATA__").innerText();
            // NOTE(review): offset 4 is presumably the payload entry that holds
            // the project data on funda.nl detail pages — confirm against a live page.
            const data = parseNuxtData(JSON.parse(rawData), 4);
            await pushData(data, "projects");
        }

        await enqueueLinks({ regexps: [PROJECT_URL] });
    },
    maxRequestsPerCrawl: 20,
    headless: false,
    launchContext: {
        launcher: chromium,
    },
});

// Start from the funda.nl search page filtered to newly built project groups.
await crawler.run(['https://www.funda.nl/zoeken/koop?construction_type=[%22newly_built%22]&type=[%22group%22]']);
|
||||
35
src/resolve.ts
Normal file
35
src/resolve.ts
Normal file
File diff suppressed because one or more lines are too long
12
tsconfig.json
Normal file
12
tsconfig.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"extends": "@apify/tsconfig",
|
||||
"compilerOptions": {
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"target": "ES2022",
|
||||
"outDir": "dist",
|
||||
"noUnusedLocals": false,
|
||||
"lib": ["DOM"]
|
||||
},
|
||||
"include": ["./src/**/*"]
|
||||
}
|
||||
Reference in New Issue
Block a user