some more changes

This commit is contained in:
2026-02-17 23:11:16 +01:00
parent f88d870db5
commit 24a505e9bf
20 changed files with 184 additions and 60 deletions

5
.env.example Normal file
View File

@@ -0,0 +1,5 @@
POSTGRES_HOST=172.17.0.1
POSTGRES_PORT=5432
POSTGRES_DB=fundamnit
POSTGRES_USER=fundamnit
POSTGRES_PASSWORD=fundamnit

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@
dist dist
node_modules node_modules
storage storage
.env

View File

@@ -1,17 +1,20 @@
set dotenv-load set dotenv-load
run_sling: run-scraper:
bun run start
run-sling:
docker run -it --rm \ docker run -it --rm \
-v ${PWD}/replication.yaml:/replication.yaml \ -v ${PWD}/sling/replication.yaml:/replication.yaml \
-v ${PWD}/storage:/storage \ -v ${PWD}/storage:/storage \
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \ -e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
-e SLING_DISABLE_TELEMETRY=true \ -e SLING_DISABLE_TELEMETRY=true \
docker.io/slingdata/sling run -r /replication.yaml docker.io/slingdata/sling run -r /replication.yaml
run: run-dbt:
docker run -it --rm \ cd dbt && dbt run --static-analysis off
-v ${PWD}/replication.yaml:/replication.yaml \
-v ${PWD}/storage:/storage \ run-all:
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \ just run-scraper
--entrypoint bash \ just run-sling
docker.io/slingdata/sling:v1.4.24 just run-dbt

5
dbt/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
target/
dbt_packages/
logs/
dbt_internal_packages/

15
dbt/README.md Normal file
View File

@@ -0,0 +1,15 @@
Welcome to your new dbt project!
### Using the starter project
Try running the following commands:
- dbt run
- dbt test
### Resources:
- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
- Find [dbt events](https://events.getdbt.com) near you
- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices

0
dbt/analyses/.gitkeep Normal file
View File

33
dbt/dbt_project.yml Normal file
View File

@@ -0,0 +1,33 @@
# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "fundamnit"
version: "1.0.0"
# This setting configures which "profile" dbt uses for this project.
profile: "fundamnit"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]
clean-targets: # directories to be removed by `dbt clean`
- "target"
- "dbt_packages"
# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models
# In this config, we tell dbt to build all models in the staging/
# directory as views. These settings can be overridden in the individual model
# files using the `{{ config(...) }}` macro.
models:
fundamnit:
staging:
+materialized: view

View File

@@ -0,0 +1,3 @@
{#
  extract_numeric(column_name)

  Renders a SQL expression that removes every character other than the
  digits 0-9 from `column_name` and casts what is left to numeric. A value
  that contains no digits at all becomes NULL instead of failing the cast.

  `column_name` is spliced into the SQL verbatim, so it can be any text
  expression, not only a bare column name.

  NOTE(review): decimal separators and minus signs are stripped as well
  ("12.5" -> 125, "-3" -> 3) - confirm the inputs are always unsigned
  whole numbers.
#}
{% macro extract_numeric(column_name) %}
NULLIF(regexp_replace({{ column_name }}, '[^0-9]','','g'), '')::numeric
{% endmacro %}

View File

@@ -0,0 +1,5 @@
-- Materializes the stg_listings model as a physical table, passing every
-- column through unchanged.
{{ config(materialized='table') }}
select
*
from
{{ ref('stg_listings') }}

View File

@@ -0,0 +1,6 @@
# Declares the raw `funda` source so models can reference it with
# source('funda', 'raw_listings').
# NOTE(review): no `schema` is set, so dbt presumably resolves the table in a
# schema named after the source (`funda`) - confirm it matches the load target.
version: 2
sources:
- name: funda
tables:
- name: raw_listings

View File

@@ -0,0 +1,53 @@
-- Staging model for funda listings: flattens the JSON document stored in the
-- `data` column of the raw source table into one typed column per attribute.
with source as (
select
*
from
{{ source('funda', 'raw_listings') }}
),
-- `#>>` returns the value at the given JSON path as text (NULL when the path
-- is absent); each value is then cast to its target type.
-- NOTE(review): a cast such as ::int or ::boolean raises on text that is not
-- a valid literal of that type - confirm the scraped values are always clean.
flattened_json as (
select
(data#>>'{globalId}')::int as listing_id,
(data#>>'{parentProject,globalId}')::int as parent_id,
(data#>>'{address,addressTitle}')::varchar as title,
(data#>>'{tracking,listing_offering_type}')::varchar as offering_type,
(data#>>'{tracking,listing_type}')::varchar as listing_type,
(data#>>'{tracking,listing_status}')::varchar as listing_status,
(data#>>'{advertising,targetingOptions,aantalkamers}')::int as rooms,
(data#>>'{featuresFastView,energyLabel}')::varchar as energy_label,
-- Area values go through the extract_numeric macro instead of a plain cast
-- (presumably because they carry a unit suffix - verify against the raw data).
{{ extract_numeric("data#>>'{featuresFastView,livingArea}'") }} as living_area,
{{ extract_numeric("data#>>'{featuresFastView,plotArea}'") }} as plot_area,
(data#>>'{price,numericPrice}')::int as price,
-- NOTE(review): sourced from isSoldOrRented, so this is presumably also true
-- for rented listings, not only sold ones - confirm the column name is intended.
(data#>>'{isSoldOrRented}')::boolean as is_sold,
(data#>>'{advertising,targetingOptions,balkon}')::boolean as has_balcony,
(data#>>'{advertising,targetingOptions,tuin}')::boolean as has_garden,
(data#>>'{advertising,targetingOptions,parkeergelegenheidopafgeslotenterrein}')::boolean as has_closed_parking,
(data#>>'{advertising,targetingOptions,parkeergelegenheidopeigenterrein}')::boolean as has_private_parking,
(data#>>'{address,country}')::varchar as country,
(data#>>'{address,province}')::varchar as province,
-- Municipality is taken from the advertising targeting options and
-- normalized to initial capitals.
initcap((data#>>'{advertising,targetingOptions,gemeente}')) as municipality,
(data#>>'{address,city}')::varchar as city,
(data#>>'{address,neighborhood,name}')::varchar as neighborhood,
(data#>>'{address,postcode}')::varchar as postcode,
(data#>>'{tracking,listing_house_no}')::varchar as house_number,
(data#>>'{tracking,listing_house_no_addition}')::varchar as house_number_addition,
(data#>>'{coordinates,lng}')::float as longitude,
(data#>>'{coordinates,lat}')::float as latitude,
(data#>>'{googleMapsObjectUrl}')::varchar as google_maps_url,
(data#>>'{canonicalUrl}')::varchar as listing_url
from
source
)
select
*
from
flattened_json

5
dbt/package-lock.yml Normal file
View File

@@ -0,0 +1,5 @@
packages:
- package: dbt-labs/dbt_utils
name: dbt_utils
version: 1.3.1
sha1_hash: a58234d0b6335a94dffedc7a19f7278eaed2ecda

3
dbt/packages.yml Normal file
View File

@@ -0,0 +1,3 @@
packages:
- package: dbt-labs/dbt_utils
version: 1.3.1

0
dbt/seeds/.gitkeep Normal file
View File

0
dbt/snapshots/.gitkeep Normal file
View File

0
dbt/tests/.gitkeep Normal file
View File

View File

@@ -1,6 +1,6 @@
services: services:
postgres: postgres:
image: docker.io/library/postgres:18 image: docker.io/postgis/postgis:18-3.6
ports: ports:
- ${POSTGRES_PORT}:5432 - ${POSTGRES_PORT}:5432
environment: environment:

View File

@@ -2,10 +2,10 @@ source: LOCAL
target: POSTGRES target: POSTGRES
defaults: defaults:
mode: full-refresh mode: snapshot
object: funda.projects object: funda.raw_listings
source_options: source_options:
format: json format: json
streams: streams:
"file:///storage/datasets/projects/": "file:///storage/datasets/listings/":

View File

@@ -1,13 +1,17 @@
import { PlaywrightCrawler } from 'crawlee'; import { PlaywrightCrawler } from 'crawlee';
import { chromium, Locator, Page } from 'patchright'; import { chromium } from 'patchright';
import * as z from 'zod';
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/; const PROJECT_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*\/\d+\/$/;
const LISTING_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/\d+\/$/;
function parseNuxtData(data: unknown[], offset: number = 0) { function parseNuxtData(data: unknown[], offset: number = 0, exclude: number[] = []) {
let item = data[offset]; let item = data[offset];
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined") { if (exclude.includes(offset)) {
return;
}
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined" || item === null) {
return item; return item;
} }
@@ -37,20 +41,38 @@ function parseNuxtData(data: unknown[], offset: number = 0) {
} }
const crawler = new PlaywrightCrawler({ const crawler = new PlaywrightCrawler({
async requestHandler({ request, page, enqueueLinks, pushData }) { async requestHandler({ request, page, enqueueLinks, log, pushData }) {
const title = await page.title(); log.info(`Processing ${request.url}`);
if (PROJECT_URL.test(request.url)) { if (LISTING_PAGE_URL.test(request.url)) {
const rawData = await page.locator("#__NUXT_DATA__").innerText(); const rawData = await page.locator("#__NUXT_DATA__").innerText();
const data = parseNuxtData(JSON.parse(rawData), 4); const data = parseNuxtData(JSON.parse(rawData), 6);
await pushData(data, "projects"); await pushData(data, "listings");
return;
} }
await enqueueLinks({ regexps: [PROJECT_URL] }) if (PROJECT_PAGE_URL.test(request.url)) {
const rawData = await page.locator("#__NUXT_DATA__").innerText();
const data = parseNuxtData(JSON.parse(rawData), 4);
const urls: string[] = [];
for (const objectType of data.projectListings.objectTypes) {
for (const listing of objectType.listings) {
urls.push(listing.listingUrl);
}
}
await enqueueLinks({ urls: urls });
return;
}
await enqueueLinks({ selector: '[aria-label=Volgende]' });
await enqueueLinks({ regexps: [PROJECT_PAGE_URL] });
}, },
maxRequestsPerCrawl: 20, maxRequestsPerCrawl: 100,
headless: false, headless: false,
launchContext: { launchContext: {
// @ts-ignore
launcher: chromium launcher: chromium
} }
}); });

File diff suppressed because one or more lines are too long