some more changes
This commit is contained in:
5
.env.example
Normal file
5
.env.example
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
POSTGRES_HOST=172.17.0.1
|
||||||
|
POSTGRES_PORT=5432
|
||||||
|
POSTGRES_DB=fundamnit
|
||||||
|
POSTGRES_USER=fundamnit
|
||||||
|
POSTGRES_PASSWORD=fundamnit
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@
|
|||||||
dist
|
dist
|
||||||
node_modules
|
node_modules
|
||||||
storage
|
storage
|
||||||
|
.env
|
||||||
|
|||||||
21
Justfile
21
Justfile
@@ -1,17 +1,20 @@
|
|||||||
set dotenv-load
|
set dotenv-load
|
||||||
|
|
||||||
run_sling:
|
run-scraper:
|
||||||
|
bun run start
|
||||||
|
|
||||||
|
run-sling:
|
||||||
docker run -it --rm \
|
docker run -it --rm \
|
||||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
-v ${PWD}/sling/replication.yaml:/replication.yaml \
|
||||||
-v ${PWD}/storage:/storage \
|
-v ${PWD}/storage:/storage \
|
||||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||||
-e SLING_DISABLE_TELEMETRY=true \
|
-e SLING_DISABLE_TELEMETRY=true \
|
||||||
docker.io/slingdata/sling run -r /replication.yaml
|
docker.io/slingdata/sling run -r /replication.yaml
|
||||||
|
|
||||||
run:
|
run-dbt:
|
||||||
docker run -it --rm \
|
cd dbt && dbt run --static-analysis off
|
||||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
|
||||||
-v ${PWD}/storage:/storage \
|
run-all:
|
||||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
just run-scraper
|
||||||
--entrypoint bash \
|
just run-sling
|
||||||
docker.io/slingdata/sling:v1.4.24
|
just run-dbt
|
||||||
|
|||||||
5
dbt/.gitignore
vendored
Normal file
5
dbt/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
|
||||||
|
target/
|
||||||
|
dbt_packages/
|
||||||
|
logs/
|
||||||
|
dbt_internal_packages/
|
||||||
15
dbt/README.md
Normal file
15
dbt/README.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
Welcome to your new dbt project!
|
||||||
|
|
||||||
|
### Using the starter project
|
||||||
|
|
||||||
|
Try running the following commands:
|
||||||
|
- dbt run
|
||||||
|
- dbt test
|
||||||
|
|
||||||
|
|
||||||
|
### Resources:
|
||||||
|
- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
|
||||||
|
- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
|
||||||
|
- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
|
||||||
|
- Find [dbt events](https://events.getdbt.com) near you
|
||||||
|
- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
|
||||||
0
dbt/analyses/.gitkeep
Normal file
0
dbt/analyses/.gitkeep
Normal file
33
dbt/dbt_project.yml
Normal file
33
dbt/dbt_project.yml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Name your project! Project names should contain only lowercase characters
|
||||||
|
# and underscores. A good package name should reflect your organization's
|
||||||
|
# name or the intended use of these models
|
||||||
|
name: "fundamnit"
|
||||||
|
version: "1.0.0"
|
||||||
|
|
||||||
|
# This setting configures which "profile" dbt uses for this project.
|
||||||
|
profile: "fundamnit"
|
||||||
|
|
||||||
|
# These configurations specify where dbt should look for different types of files.
|
||||||
|
# The `model-paths` config, for example, states that models in this project can be
|
||||||
|
# found in the "models/" directory. You probably won't need to change these!
|
||||||
|
model-paths: ["models"]
|
||||||
|
analysis-paths: ["analyses"]
|
||||||
|
test-paths: ["tests"]
|
||||||
|
seed-paths: ["seeds"]
|
||||||
|
macro-paths: ["macros"]
|
||||||
|
snapshot-paths: ["snapshots"]
|
||||||
|
|
||||||
|
clean-targets: # directories to be removed by `dbt clean`
|
||||||
|
- "target"
|
||||||
|
- "dbt_packages"
|
||||||
|
|
||||||
|
# Configuring models
|
||||||
|
# Full documentation: https://docs.getdbt.com/docs/configuring-models
|
||||||
|
|
||||||
|
# In this example config, we tell dbt to build all models in the example/
|
||||||
|
# directory as views. These settings can be overridden in the individual model
|
||||||
|
# files using the `{{ config(...) }}` macro.
|
||||||
|
models:
|
||||||
|
fundamnit:
|
||||||
|
staging:
|
||||||
|
+materialized: view
|
||||||
3
dbt/macros/extract_numeric.sql
Normal file
3
dbt/macros/extract_numeric.sql
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
{% macro extract_numeric(column_name) %}
|
||||||
|
NULLIF(regexp_replace({{ column_name }}, '[^0-9]','','g'), '')::numeric
|
||||||
|
{% endmacro %}
|
||||||
5
dbt/models/marts/listings.sql
Normal file
5
dbt/models/marts/listings.sql
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{{ config(materialized='table') }}
|
||||||
|
select
|
||||||
|
*
|
||||||
|
from
|
||||||
|
{{ ref('stg_listings') }}
|
||||||
6
dbt/models/staging/__sources.yml
Normal file
6
dbt/models/staging/__sources.yml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
sources:
|
||||||
|
- name: funda
|
||||||
|
tables:
|
||||||
|
- name: raw_listings
|
||||||
53
dbt/models/staging/stg_listings.sql
Normal file
53
dbt/models/staging/stg_listings.sql
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
with source as (
|
||||||
|
select
|
||||||
|
*
|
||||||
|
from
|
||||||
|
{{ source('funda', 'raw_listings') }}
|
||||||
|
),
|
||||||
|
flattened_json as (
|
||||||
|
select
|
||||||
|
(data#>>'{globalId}')::int as listing_id,
|
||||||
|
(data#>>'{parentProject,globalId}')::int as parent_id,
|
||||||
|
|
||||||
|
(data#>>'{address,addressTitle}')::varchar as title,
|
||||||
|
|
||||||
|
(data#>>'{tracking,listing_offering_type}')::varchar as offering_type,
|
||||||
|
(data#>>'{tracking,listing_type}')::varchar as listing_type,
|
||||||
|
(data#>>'{tracking,listing_status}')::varchar as listing_status,
|
||||||
|
|
||||||
|
(data#>>'{advertising,targetingOptions,aantalkamers}')::int as rooms,
|
||||||
|
(data#>>'{featuresFastView,energyLabel}')::varchar as energy_label,
|
||||||
|
|
||||||
|
{{ extract_numeric("data#>>'{featuresFastView,livingArea}'") }} as living_area,
|
||||||
|
{{ extract_numeric("data#>>'{featuresFastView,plotArea}'") }} as plot_area,
|
||||||
|
|
||||||
|
(data#>>'{price,numericPrice}')::int as price,
|
||||||
|
(data#>>'{isSoldOrRented}')::boolean as is_sold,
|
||||||
|
|
||||||
|
(data#>>'{advertising,targetingOptions,balkon}')::boolean as has_balcony,
|
||||||
|
(data#>>'{advertising,targetingOptions,tuin}')::boolean as has_garden,
|
||||||
|
(data#>>'{advertising,targetingOptions,parkeergelegenheidopafgeslotenterrein}')::boolean as has_closed_parking,
|
||||||
|
(data#>>'{advertising,targetingOptions,parkeergelegenheidopeigenterrein}')::boolean as has_private_parking,
|
||||||
|
|
||||||
|
(data#>>'{address,country}')::varchar as country,
|
||||||
|
(data#>>'{address,province}')::varchar as province,
|
||||||
|
initcap((data#>>'{advertising,targetingOptions,gemeente}')) as municipality,
|
||||||
|
(data#>>'{address,city}')::varchar as city,
|
||||||
|
(data#>>'{address,neighborhood,name}')::varchar as neighborhood,
|
||||||
|
(data#>>'{address,postcode}')::varchar as postcode,
|
||||||
|
(data#>>'{tracking,listing_house_no}')::varchar as house_number,
|
||||||
|
(data#>>'{tracking,listing_house_no_addition}')::varchar as house_number_addition,
|
||||||
|
|
||||||
|
(data#>>'{coordinates,lng}')::float as longitude,
|
||||||
|
(data#>>'{coordinates,lat}')::float as latitude,
|
||||||
|
|
||||||
|
(data#>>'{googleMapsObjectUrl}')::varchar as google_maps_url,
|
||||||
|
|
||||||
|
(data#>>'{canonicalUrl}')::varchar as listing_url
|
||||||
|
from
|
||||||
|
source
|
||||||
|
)
|
||||||
|
select
|
||||||
|
*
|
||||||
|
from
|
||||||
|
flattened_json
|
||||||
5
dbt/package-lock.yml
Normal file
5
dbt/package-lock.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
packages:
|
||||||
|
- package: dbt-labs/dbt_utils
|
||||||
|
name: dbt_utils
|
||||||
|
version: 1.3.1
|
||||||
|
sha1_hash: a58234d0b6335a94dffedc7a19f7278eaed2ecda
|
||||||
3
dbt/packages.yml
Normal file
3
dbt/packages.yml
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
packages:
|
||||||
|
- package: dbt-labs/dbt_utils
|
||||||
|
version: 1.3.1
|
||||||
0
dbt/seeds/.gitkeep
Normal file
0
dbt/seeds/.gitkeep
Normal file
0
dbt/snapshots/.gitkeep
Normal file
0
dbt/snapshots/.gitkeep
Normal file
0
dbt/tests/.gitkeep
Normal file
0
dbt/tests/.gitkeep
Normal file
@@ -1,6 +1,6 @@
|
|||||||
services:
|
services:
|
||||||
postgres:
|
postgres:
|
||||||
image: docker.io/library/postgres:18
|
image: docker.io/postgis/postgis:18-3.6
|
||||||
ports:
|
ports:
|
||||||
- ${POSTGRES_PORT}:5432
|
- ${POSTGRES_PORT}:5432
|
||||||
environment:
|
environment:
|
||||||
|
|||||||
@@ -2,10 +2,10 @@ source: LOCAL
|
|||||||
target: POSTGRES
|
target: POSTGRES
|
||||||
|
|
||||||
defaults:
|
defaults:
|
||||||
mode: full-refresh
|
mode: snapshot
|
||||||
object: funda.projects
|
object: funda.raw_listings
|
||||||
source_options:
|
source_options:
|
||||||
format: json
|
format: json
|
||||||
|
|
||||||
streams:
|
streams:
|
||||||
"file:///storage/datasets/projects/":
|
"file:///storage/datasets/listings/":
|
||||||
46
src/main.ts
46
src/main.ts
@@ -1,13 +1,17 @@
|
|||||||
import { PlaywrightCrawler } from 'crawlee';
|
import { PlaywrightCrawler } from 'crawlee';
|
||||||
import { chromium, Locator, Page } from 'patchright';
|
import { chromium } from 'patchright';
|
||||||
import * as z from 'zod';
|
|
||||||
|
|
||||||
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/;
|
const PROJECT_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*\/\d+\/$/;
|
||||||
|
const LISTING_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/\d+\/$/;
|
||||||
|
|
||||||
function parseNuxtData(data: unknown[], offset: number = 0) {
|
function parseNuxtData(data: unknown[], offset: number = 0, exclude: number[] = []) {
|
||||||
let item = data[offset];
|
let item = data[offset];
|
||||||
|
|
||||||
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined") {
|
if (exclude.includes(offset)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined" || item === null) {
|
||||||
return item;
|
return item;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -37,20 +41,38 @@ function parseNuxtData(data: unknown[], offset: number = 0) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const crawler = new PlaywrightCrawler({
|
const crawler = new PlaywrightCrawler({
|
||||||
async requestHandler({ request, page, enqueueLinks, pushData }) {
|
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
|
||||||
const title = await page.title();
|
log.info(`Processing ${request.url}`);
|
||||||
|
|
||||||
if (PROJECT_URL.test(request.url)) {
|
if (LISTING_PAGE_URL.test(request.url)) {
|
||||||
const rawData = await page.locator("#__NUXT_DATA__").innerText();
|
const rawData = await page.locator("#__NUXT_DATA__").innerText();
|
||||||
const data = parseNuxtData(JSON.parse(rawData), 4);
|
const data = parseNuxtData(JSON.parse(rawData), 6);
|
||||||
await pushData(data, "projects");
|
await pushData(data, "listings");
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
await enqueueLinks({ regexps: [PROJECT_URL] })
|
if (PROJECT_PAGE_URL.test(request.url)) {
|
||||||
|
const rawData = await page.locator("#__NUXT_DATA__").innerText();
|
||||||
|
const data = parseNuxtData(JSON.parse(rawData), 4);
|
||||||
|
|
||||||
|
const urls: string[] = [];
|
||||||
|
for (const objectType of data.projectListings.objectTypes) {
|
||||||
|
for (const listing of objectType.listings) {
|
||||||
|
urls.push(listing.listingUrl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await enqueueLinks({ urls: urls });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
await enqueueLinks({ selector: '[aria-label=Volgende]' });
|
||||||
|
await enqueueLinks({ regexps: [PROJECT_PAGE_URL] });
|
||||||
},
|
},
|
||||||
maxRequestsPerCrawl: 20,
|
maxRequestsPerCrawl: 100,
|
||||||
headless: false,
|
headless: false,
|
||||||
launchContext: {
|
launchContext: {
|
||||||
|
// @ts-ignore
|
||||||
launcher: chromium
|
launcher: chromium
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user