some more changes
This commit is contained in:
5
.env.example
Normal file
5
.env.example
Normal file
@@ -0,0 +1,5 @@
|
||||
POSTGRES_HOST=172.17.0.1
|
||||
POSTGRES_PORT=5432
|
||||
POSTGRES_DB=fundamnit
|
||||
POSTGRES_USER=fundamnit
|
||||
POSTGRES_PASSWORD=fundamnit
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@
|
||||
dist
|
||||
node_modules
|
||||
storage
|
||||
.env
|
||||
|
||||
21
Justfile
21
Justfile
@@ -1,17 +1,20 @@
|
||||
set dotenv-load
|
||||
|
||||
run_sling:
|
||||
run-scraper:
|
||||
bun run start
|
||||
|
||||
run-sling:
|
||||
docker run -it --rm \
|
||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||
-v ${PWD}/sling/replication.yaml:/replication.yaml \
|
||||
-v ${PWD}/storage:/storage \
|
||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||
-e SLING_DISABLE_TELEMETRY=true \
|
||||
docker.io/slingdata/sling run -r /replication.yaml
|
||||
|
||||
run:
|
||||
docker run -it --rm \
|
||||
-v ${PWD}/replication.yaml:/replication.yaml \
|
||||
-v ${PWD}/storage:/storage \
|
||||
-e POSTGRES="postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB?sslmode=disable" \
|
||||
--entrypoint bash \
|
||||
docker.io/slingdata/sling:v1.4.24
|
||||
run-dbt:
|
||||
cd dbt && dbt run --static-analysis off
|
||||
|
||||
run-all:
|
||||
just run-scraper
|
||||
just run-sling
|
||||
just run-dbt
|
||||
|
||||
5
dbt/.gitignore
vendored
Normal file
5
dbt/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
|
||||
target/
|
||||
dbt_packages/
|
||||
logs/
|
||||
dbt_internal_packages/
|
||||
15
dbt/README.md
Normal file
15
dbt/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
Welcome to your new dbt project!
|
||||
|
||||
### Using the starter project
|
||||
|
||||
Try running the following commands:
|
||||
- dbt run
|
||||
- dbt test
|
||||
|
||||
|
||||
### Resources:
|
||||
- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
|
||||
- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
|
||||
- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
|
||||
- Find [dbt events](https://events.getdbt.com) near you
|
||||
- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
|
||||
0
dbt/analyses/.gitkeep
Normal file
0
dbt/analyses/.gitkeep
Normal file
33
dbt/dbt_project.yml
Normal file
33
dbt/dbt_project.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Name your project! Project names should contain only lowercase characters
|
||||
# and underscores. A good package name should reflect your organization's
|
||||
# name or the intended use of these models
|
||||
name: "fundamnit"
|
||||
version: "1.0.0"
|
||||
|
||||
# This setting configures which "profile" dbt uses for this project.
|
||||
profile: "fundamnit"
|
||||
|
||||
# These configurations specify where dbt should look for different types of files.
|
||||
# The `model-paths` config, for example, states that models in this project can be
|
||||
# found in the "models/" directory. You probably won't need to change these!
|
||||
model-paths: ["models"]
|
||||
analysis-paths: ["analyses"]
|
||||
test-paths: ["tests"]
|
||||
seed-paths: ["seeds"]
|
||||
macro-paths: ["macros"]
|
||||
snapshot-paths: ["snapshots"]
|
||||
|
||||
clean-targets: # directories to be removed by `dbt clean`
|
||||
- "target"
|
||||
- "dbt_packages"
|
||||
|
||||
# Configuring models
|
||||
# Full documentation: https://docs.getdbt.com/docs/configuring-models
|
||||
|
||||
# In this config, we tell dbt to build all models in the staging/
|
||||
# directory as views. These settings can be overridden in the individual model
|
||||
# files using the `{{ config(...) }}` macro.
|
||||
models:
|
||||
fundamnit:
|
||||
staging:
|
||||
+materialized: view
|
||||
3
dbt/macros/extract_numeric.sql
Normal file
3
dbt/macros/extract_numeric.sql
Normal file
@@ -0,0 +1,3 @@
|
||||
{% macro extract_numeric(column_name) %}
|
||||
NULLIF(regexp_replace({{ column_name }}, '[^0-9]','','g'), '')::numeric
|
||||
{% endmacro %}
|
||||
5
dbt/models/marts/listings.sql
Normal file
5
dbt/models/marts/listings.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
{{ config(materialized='table') }}
|
||||
select
|
||||
*
|
||||
from
|
||||
{{ ref('stg_listings') }}
|
||||
6
dbt/models/staging/__sources.yml
Normal file
6
dbt/models/staging/__sources.yml
Normal file
@@ -0,0 +1,6 @@
|
||||
version: 2
|
||||
|
||||
sources:
|
||||
- name: funda
|
||||
tables:
|
||||
- name: raw_listings
|
||||
53
dbt/models/staging/stg_listings.sql
Normal file
53
dbt/models/staging/stg_listings.sql
Normal file
@@ -0,0 +1,53 @@
|
||||
with source as (
|
||||
select
|
||||
*
|
||||
from
|
||||
{{ source('funda', 'raw_listings') }}
|
||||
),
|
||||
flattened_json as (
|
||||
select
|
||||
(data#>>'{globalId}')::int as listing_id,
|
||||
(data#>>'{parentProject,globalId}')::int as parent_id,
|
||||
|
||||
(data#>>'{address,addressTitle}')::varchar as title,
|
||||
|
||||
(data#>>'{tracking,listing_offering_type}')::varchar as offering_type,
|
||||
(data#>>'{tracking,listing_type}')::varchar as listing_type,
|
||||
(data#>>'{tracking,listing_status}')::varchar as listing_status,
|
||||
|
||||
(data#>>'{advertising,targetingOptions,aantalkamers}')::int as rooms,
|
||||
(data#>>'{featuresFastView,energyLabel}')::varchar as energy_label,
|
||||
|
||||
{{ extract_numeric("data#>>'{featuresFastView,livingArea}'") }} as living_area,
|
||||
{{ extract_numeric("data#>>'{featuresFastView,plotArea}'") }} as plot_area,
|
||||
|
||||
(data#>>'{price,numericPrice}')::int as price,
|
||||
(data#>>'{isSoldOrRented}')::boolean as is_sold,
|
||||
|
||||
(data#>>'{advertising,targetingOptions,balkon}')::boolean as has_balcony,
|
||||
(data#>>'{advertising,targetingOptions,tuin}')::boolean as has_garden,
|
||||
(data#>>'{advertising,targetingOptions,parkeergelegenheidopafgeslotenterrein}')::boolean as has_closed_parking,
|
||||
(data#>>'{advertising,targetingOptions,parkeergelegenheidopeigenterrein}')::boolean as has_private_parking,
|
||||
|
||||
(data#>>'{address,country}')::varchar as country,
|
||||
(data#>>'{address,province}')::varchar as province,
|
||||
initcap((data#>>'{advertising,targetingOptions,gemeente}')) as municipality,
|
||||
(data#>>'{address,city}')::varchar as city,
|
||||
(data#>>'{address,neighborhood,name}')::varchar as neighborhood,
|
||||
(data#>>'{address,postcode}')::varchar as postcode,
|
||||
(data#>>'{tracking,listing_house_no}')::varchar as house_number,
|
||||
(data#>>'{tracking,listing_house_no_addition}')::varchar as house_number_addition,
|
||||
|
||||
(data#>>'{coordinates,lng}')::float as longitude,
|
||||
(data#>>'{coordinates,lat}')::float as latitude,
|
||||
|
||||
(data#>>'{googleMapsObjectUrl}')::varchar as google_maps_url,
|
||||
|
||||
(data#>>'{canonicalUrl}')::varchar as listing_url
|
||||
from
|
||||
source
|
||||
)
|
||||
select
|
||||
*
|
||||
from
|
||||
flattened_json
|
||||
5
dbt/package-lock.yml
Normal file
5
dbt/package-lock.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
packages:
|
||||
- package: dbt-labs/dbt_utils
|
||||
name: dbt_utils
|
||||
version: 1.3.1
|
||||
sha1_hash: a58234d0b6335a94dffedc7a19f7278eaed2ecda
|
||||
3
dbt/packages.yml
Normal file
3
dbt/packages.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
packages:
|
||||
- package: dbt-labs/dbt_utils
|
||||
version: 1.3.1
|
||||
0
dbt/seeds/.gitkeep
Normal file
0
dbt/seeds/.gitkeep
Normal file
0
dbt/snapshots/.gitkeep
Normal file
0
dbt/snapshots/.gitkeep
Normal file
0
dbt/tests/.gitkeep
Normal file
0
dbt/tests/.gitkeep
Normal file
@@ -1,6 +1,6 @@
|
||||
services:
|
||||
postgres:
|
||||
image: docker.io/library/postgres:18
|
||||
image: docker.io/postgis/postgis:18-3.6
|
||||
ports:
|
||||
- ${POSTGRES_PORT}:5432
|
||||
environment:
|
||||
|
||||
@@ -2,10 +2,10 @@ source: LOCAL
|
||||
target: POSTGRES
|
||||
|
||||
defaults:
|
||||
mode: full-refresh
|
||||
object: funda.projects
|
||||
mode: snapshot
|
||||
object: funda.raw_listings
|
||||
source_options:
|
||||
format: json
|
||||
|
||||
streams:
|
||||
"file:///storage/datasets/projects/":
|
||||
"file:///storage/datasets/listings/":
|
||||
46
src/main.ts
46
src/main.ts
@@ -1,13 +1,17 @@
|
||||
import { PlaywrightCrawler } from 'crawlee';
|
||||
import { chromium, Locator, Page } from 'patchright';
|
||||
import * as z from 'zod';
|
||||
import { chromium } from 'patchright';
|
||||
|
||||
const PROJECT_URL = /https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*/;
|
||||
const PROJECT_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/nieuwbouw\/.*\/\d+\/$/;
|
||||
const LISTING_PAGE_URL = /^https:\/\/www\.funda\.nl\/detail\/\d+\/$/;
|
||||
|
||||
function parseNuxtData(data: unknown[], offset: number = 0) {
|
||||
function parseNuxtData(data: unknown[], offset: number = 0, exclude: number[] = []) {
|
||||
let item = data[offset];
|
||||
|
||||
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined") {
|
||||
if (exclude.includes(offset)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (typeof item === "string" || typeof item === "number" || typeof item === "boolean" || typeof item === "undefined" || item === null) {
|
||||
return item;
|
||||
}
|
||||
|
||||
@@ -37,20 +41,38 @@ function parseNuxtData(data: unknown[], offset: number = 0) {
|
||||
}
|
||||
|
||||
const crawler = new PlaywrightCrawler({
|
||||
async requestHandler({ request, page, enqueueLinks, pushData }) {
|
||||
const title = await page.title();
|
||||
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
|
||||
log.info(`Processing ${request.url}`);
|
||||
|
||||
if (PROJECT_URL.test(request.url)) {
|
||||
if (LISTING_PAGE_URL.test(request.url)) {
|
||||
const rawData = await page.locator("#__NUXT_DATA__").innerText();
|
||||
const data = parseNuxtData(JSON.parse(rawData), 4);
|
||||
await pushData(data, "projects");
|
||||
const data = parseNuxtData(JSON.parse(rawData), 6);
|
||||
await pushData(data, "listings");
|
||||
return;
|
||||
}
|
||||
|
||||
await enqueueLinks({ regexps: [PROJECT_URL] })
|
||||
if (PROJECT_PAGE_URL.test(request.url)) {
|
||||
const rawData = await page.locator("#__NUXT_DATA__").innerText();
|
||||
const data = parseNuxtData(JSON.parse(rawData), 4);
|
||||
|
||||
const urls: string[] = [];
|
||||
for (const objectType of data.projectListings.objectTypes) {
|
||||
for (const listing of objectType.listings) {
|
||||
urls.push(listing.listingUrl);
|
||||
}
|
||||
}
|
||||
|
||||
await enqueueLinks({ urls: urls });
|
||||
return;
|
||||
}
|
||||
|
||||
await enqueueLinks({ selector: '[aria-label=Volgende]' });
|
||||
await enqueueLinks({ regexps: [PROJECT_PAGE_URL] });
|
||||
},
|
||||
maxRequestsPerCrawl: 20,
|
||||
maxRequestsPerCrawl: 100,
|
||||
headless: false,
|
||||
launchContext: {
|
||||
// @ts-ignore
|
||||
launcher: chromium
|
||||
}
|
||||
});
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user