-- analyse_legislatives_2024/analyse-duckdb.sql
--
-- 181 lines
-- 4.9 KiB
-- SQL

-------------------------
-- Prep
-------------------------
-- Directory used to resolve the relative file names read further down
-- (nuances.csv, legislatives_2024_fr_results.json, report_voix_duel.csv).
-- NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
SET file_search_path = '/home/daifukusan/Documents/analysis/legislatives2024/res/';
-- Load nuances.csv and append a numeric score derived from its `orientation`
-- column: -2 (Extrême Gauche) through 2 (Extrême Droite).
-- Any other orientation label yields NULL.
CREATE OR REPLACE TEMP TABLE nuances AS
SELECT
    *,
    CASE orientation
        WHEN 'Extrême Gauche' THEN -2
        WHEN 'Gauche'         THEN -1
        WHEN 'Centre'         THEN 0
        WHEN 'Droite'         THEN 1
        WHEN 'Extrême Droite' THEN 2
    END AS orientation_num
FROM read_csv_auto('nuances.csv');
-- Flatten the results JSON: one output row per element of each record's
-- `results` array. Each element is read positionally:
--   [0] nuance, [1] candidat, [2] voix, [3] inscrits, [4] exprimes, [5] qualif.
-- region_code / circo_code are the text found between parentheses in the
-- region / circo labels.
-- trim(..., '"') removes the double quotes surrounding values returned by `->`.
-- NOTE(review): inscrits / exprimes are presumably percentages — confirm
-- against the JSON producer.
CREATE OR REPLACE TEMP TABLE raw AS
WITH exploded AS (
    SELECT
        region,
        circo,
        unnest(results) AS entry
    FROM read_json_auto('legislatives_2024_fr_results.json')
)
SELECT
    regexp_extract(region, '\((.+)\)', 1) AS region_code,
    region,
    regexp_extract(circo, '\((.+)\)', 1)  AS circo_code,
    circo,
    trim(entry->'$[0]', '"')              AS nuance,
    trim(entry->'$[1]', '"')              AS candidat,
    CAST(entry->'$[2]' AS BIGINT)         AS voix,
    CAST(entry->'$[3]' AS DECIMAL)        AS inscrits,
    CAST(entry->'$[4]' AS DECIMAL)        AS exprimes,
    trim(entry->'$[5]', '"')              AS qualif
FROM exploded;
-- All rows of `raw` belonging to a (region, circo) in which at least one row
-- has qualif = 'QUALIF T2'.
CREATE OR REPLACE TEMP TABLE undecided AS
SELECT r.*
FROM raw AS r
WHERE EXISTS (
    SELECT 1
    FROM raw AS q
    WHERE q.qualif = 'QUALIF T2'
      AND q.region = r.region
      AND q.circo = r.circo
);
-------------------
-- Analyse
-------------------
-- Second-round line-ups: for each (region, circo), the nuances of the rows
-- flagged 'QUALIF T2', concatenated by ascending `exprimes`; then the number
-- of constituencies sharing each line-up, most frequent first.
WITH line_ups AS (
    SELECT
        region,
        circo,
        string_agg(nuance, ', ' ORDER BY exprimes) AS vs
    FROM raw
    WHERE qualif = 'QUALIF T2'
    GROUP BY region, circo
)
SELECT
    count(*) AS "nombre",
    vs       AS "configuration"
FROM line_ups
GROUP BY vs
ORDER BY "nombre" DESC;
-- Number of constituencies with and without a second round.
-- A (region, circo) carrying a qualif = 'OUI' row is reported as 'NON'
-- (no second round); one carrying 'QUALIF T2' rows is reported as 'OUI'.
WITH circo_status AS (
    SELECT DISTINCT
        region,
        circo,
        qualif
    FROM raw
    WHERE qualif IN ('QUALIF T2', 'OUI')
)
SELECT
    CASE qualif WHEN 'OUI' THEN 'NON' ELSE 'OUI' END AS "2nd tour ?",
    count(*)                                         AS "nombre"
FROM circo_status
GROUP BY qualif;
-- For constituencies going to a second round: how many times the 'UG' nuance
-- (NFP, per the original comment) finished in each position.
-- The position is the rank by descending `exprimes` among the rows flagged
-- 'QUALIF T2' of the same (region, circo).
WITH ranked AS (
    SELECT
        nuance,
        row_number() OVER (
            PARTITION BY region, circo
            ORDER BY exprimes DESC
        ) AS pos
    FROM raw
    WHERE qualif = 'QUALIF T2'
)
SELECT
    pos      AS "Position 1er tour",
    count(*) AS "Nombre de Circonscription"
FROM ranked
WHERE nuance = 'UG'
GROUP BY pos
ORDER BY pos;
----------------------
-- Projection
----------------------
-- Rows of `undecided` enriched with the matching `nuances` row (joined on
-- nuance = code), plus:
--   qualified : FALSE only when qualif = 'NON', TRUE otherwise (NULL included);
--   pos       : rank by descending `exprimes` within the (region, circo).
-- Rows whose nuance has no match in `nuances` are dropped by the inner join.
CREATE OR REPLACE TEMP TABLE cleaned_undecided AS
SELECT
    r.region_code,
    r.circo_code,
    r.region,
    r.circo,
    r.nuance,
    r.candidat,
    r.voix,
    r.inscrits,
    r.exprimes,
    (r.qualif IS DISTINCT FROM 'NON') AS qualified,
    n.code,
    n.nom,
    n.orientation,
    n.orientation_num,
    row_number() OVER (
        PARTITION BY r.region, r.circo
        ORDER BY r.exprimes DESC
    ) AS pos
FROM undecided AS r
INNER JOIN nuances AS n
    ON r.nuance = n.code;
-- Load report_voix_duel.csv as-is. The projection query further down reads
-- its candidat_1, candidat_2, "from", "to" and quantity columns.
CREATE OR REPLACE TEMP TABLE report_voix_duel AS
FROM read_csv_auto('report_voix_duel.csv');
-- Every row of `cleaned_undecided`, annotated with the duel of its
-- constituency: the orientations of the qualified candidates ranked in the
-- top two (pos <= 2), sorted alphabetically into orientation_candidat_1 / _2.
-- bucket_orientation(_num) folds 'Extrême Gauche' (-2) into 'Gauche' (-1).
--
-- A constituency is keyed on (region, circo), like the `pos` window in
-- `cleaned_undecided` and the other queries of this script. Keying on
-- circo_code alone would merge constituencies whenever a code repeats
-- across regions or the regex found no code in the label.
CREATE OR REPLACE TEMP TABLE tour_2 AS
WITH duels AS (
    SELECT
        region,
        circo,
        list(orientation ORDER BY orientation) AS orientations
    FROM cleaned_undecided
    WHERE qualified = TRUE
      AND pos <= 2
    GROUP BY region, circo
)
SELECT
    cu.*,
    CASE WHEN cu.orientation = 'Extrême Gauche' THEN 'Gauche' ELSE cu.orientation END AS bucket_orientation,
    CASE WHEN cu.orientation_num = -2 THEN -1 ELSE cu.orientation_num END AS bucket_orientation_num,
    d.orientations[1] AS orientation_candidat_1,
    d.orientations[2] AS orientation_candidat_2
FROM cleaned_undecided AS cu
INNER JOIN duels AS d
    ON d.region = cu.region
   AND d.circo = cu.circo;
-----
-- Constituencies in which at least two qualified candidates share the same
-- orientation; `nuances` lists their nuance codes by descending `exprimes`.
SELECT
    region,
    circo,
    orientation,
    string_agg(nuance, ', ' ORDER BY exprimes DESC) AS nuances
FROM cleaned_undecided
WHERE qualified = TRUE
GROUP BY region, circo, orientation
HAVING count(*) >= 2
ORDER BY orientation, region, circo;  -- terminator was missing: the statement ran into the next WITH
-- Duels assuming automatic withdrawal: number of constituencies per pair of
-- orientations, most frequent first.
-- Rows are de-duplicated per (region, circo), the constituency key used by
-- the rest of the script, so that two constituencies sharing a circo_code and
-- the same duel are still counted separately.
WITH duels_circo AS (
    SELECT DISTINCT
        region,
        circo,
        orientation_candidat_1,
        orientation_candidat_2
    FROM tour_2
)
SELECT
    orientation_candidat_1,
    orientation_candidat_2,
    count(*) AS n
FROM duels_circo
GROUP BY orientation_candidat_1, orientation_candidat_2
ORDER BY n DESC;
-- Projection after vote transfers.
--   transfers : each tour_2 row is matched with the report_voix_duel rows for
--               its duel (either candidate order) whose "from" equals the
--               row's orientation; exprimes * quantity goes to orientation "to".
--   totals    : projected score per (region, circo, orientation).
--   winners   : the orientation with the highest projected score in each
--               (region, circo).
-- Final result: number of constituencies won per orientation.
WITH transfers AS (
    SELECT
        dc.region,
        dc.circo,
        rvd."to"                    AS orientation,
        dc.exprimes * rvd.quantity  AS projection
    FROM tour_2 AS dc
    INNER JOIN report_voix_duel AS rvd
        ON rvd."from" = dc.orientation
       AND (
               (dc.orientation_candidat_1 = rvd.candidat_1 AND dc.orientation_candidat_2 = rvd.candidat_2)
            OR (dc.orientation_candidat_1 = rvd.candidat_2 AND dc.orientation_candidat_2 = rvd.candidat_1)
           )
),
totals AS (
    SELECT
        region,
        circo,
        orientation,
        sum(projection) AS projection
    FROM transfers
    GROUP BY region, circo, orientation
),
winners AS (
    SELECT orientation
    FROM totals
    QUALIFY row_number() OVER (
        PARTITION BY region, circo
        ORDER BY projection DESC
    ) = 1
)
SELECT
    orientation,
    count(*) AS n
FROM winners
GROUP BY orientation
ORDER BY n DESC;