analyse_legislatives_2024/analyse-duckdb.sql

349 lines
8.9 KiB
SQL

-------------------------
-- Prep
-------------------------
SET file_search_path = '/home/daifukusan/Documents/analysis/legislatives2024/res/';
CREATE OR REPLACE TEMP TABLE nuances AS
SELECT
*,
CASE
WHEN orientation = 'Extrême Gauche' THEN -2
WHEN orientation = 'Gauche' THEN -1
WHEN orientation = 'Centre' THEN 0
WHEN orientation = 'Droite' THEN 1
WHEN orientation = 'Extrême Droite' THEN 2
END AS 'orientation_num'
FROM read_csv_auto('nuances.csv');
CREATE OR REPLACE TEMP TABLE raw AS
SELECT
regexp_extract(region, '\((.+)\)', 1) AS region_code,
region,
regexp_extract(circo, '\((.+)\)', 1) AS circo_code,
circo,
trim(results->'$[0]', '"') AS nuance,
trim(results->'$[1]', '"') AS candidat,
CAST(results->'$[2]' AS BIGINT) AS voix,
CAST(results->'$[3]' AS DECIMAL) AS inscrits,
CAST(results->'$[4]' AS DECIMAL) AS exprimes,
trim(results->'$[5]', '"') AS qualif,
FROM (
SELECT region, circo, unnest(results) AS results
FROM read_json_auto('legislatives_2024_fr_results.json')
) AS t;
CREATE OR REPLACE TEMP TABLE undecided AS
WITH with_qualif AS (
SELECT DISTINCT region, circo
FROM raw
WHERE qualif = 'QUALIF T2'
)
SELECT r.* FROM raw AS r
JOIN with_qualif AS u
ON u.region = r.region
AND u.circo = r.circo;
-------------------
-- Analyse
-------------------
-- Configurations de second tours par ordre de voix par qualifiés
WITH versus AS (
SELECT region, circo, string_agg(nuance, ', ' ORDER BY exprimes) AS vs
FROM raw
WHERE qualif = 'QUALIF T2'
GROUP BY region, circo
ORDER BY region, circo
)
SELECT
count(*) AS "nombre",
vs AS "configuration" FROM versus
GROUP BY vs
ORDER BY "nombre" DESC;
-- Nombre de 2n tours
SELECT
CASE WHEN qualif = 'OUI' THEN 'NON' ELSE 'OUI' END AS '2nd tour ?',
count(*) as 'nombre'
FROM (SELECT DISTINCT region, circo, qualif FROM raw WHERE qualif in ('QUALIF T2', 'OUI')) AS t
GROUP BY qualif;
-- Nombre de circonscriptions qui auront un 2nd tour dans lesquelles NFP est arrivé à telle ou telle position au premier tour:
WITH ordered AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY region, circo ORDER BY exprimes DESC) AS pos
FROM raw
WHERE qualif = 'QUALIF T2'
)
SELECT pos AS "Position 1er tour", count(*) AS "Nombre de Circonscription" FROM ordered
WHERE nuance = 'UG'
GROUP BY pos
ORDER BY pos;
----------------------
-- Projection
----------------------
CREATE OR REPLACE TEMP TABLE cleaned_undecided AS
SELECT
region_code,
circo_code,
region,
circo,
nuance,
candidat,
voix,
inscrits,
exprimes,
CASE WHEN qualif = 'NON' THEN FALSE ELSE TRUE END AS qualified,
code,
nom,
orientation,
orientation_num,
ROW_NUMBER() OVER (PARTITION BY region, circo ORDER BY exprimes DESC) AS pos
FROM undecided AS r
JOIN nuances AS n
ON r.nuance = n.code;
CREATE OR REPLACE TEMP TABLE report_voix_duel AS
SELECT
*
FROM read_csv_auto('report_voix_duel.csv');
CREATE OR REPLACE TEMP TABLE tour_2 AS
WITH duels AS (
SELECT circo_code, list(orientation ORDER BY orientation) AS orientations
FROM cleaned_undecided
WHERE qualified = TRUE
AND pos <= 2
GROUP BY circo_code
)
SELECT
cu.*,
CASE WHEN orientation = 'Extrême Gauche' THEN 'Gauche' ELSE orientation END AS bucket_orientation,
CASE WHEN orientation_num = -2 THEN -1 ELSE orientation_num END AS bucket_orientation_num,
d.orientations[1] AS orientation_candidat_1,
d.orientations[2] AS orientation_candidat_2
FROM cleaned_undecided AS cu
JOIN duels AS d
ON d.circo_code = cu.circo_code;
CREATE OR REPLACE TEMP TABLE projection AS
WITH report AS (
SELECT
dc.region,
dc.circo,
rvd."to" AS orientation,
dc.exprimes * rvd.quantity AS projection
FROM tour_2 AS dc
JOIN report_voix_duel AS rvd
ON (
dc.orientation_candidat_1 = rvd.candidat_1 AND dc.orientation_candidat_2 = rvd.candidat_2
OR dc.orientation_candidat_1 = rvd.candidat_2 AND dc.orientation_candidat_2 = rvd.candidat_1
) AND rvd."from" = dc.orientation
ORDER BY circo_code
), agg_projection AS (
SELECT region, circo, orientation, SUM(projection) AS projection
FROM report
GROUP BY region, circo, orientation
ORDER BY region, circo, orientation
)
SELECT
region,
circo,
orientation,
projection / SUM(projection) OVER (PARTITION BY region, circo) AS projection
FROM agg_projection
CREATE OR REPLACE TEMP TABLE projection_voix AS
SELECT
dc.region,
dc.circo,
rvd."to" AS orientation,
SUM(dc.voix * rvd.quantity) AS projection
FROM tour_2 AS dc
JOIN report_voix_duel AS rvd
ON (
dc.orientation_candidat_1 = rvd.candidat_1 AND dc.orientation_candidat_2 = rvd.candidat_2
OR dc.orientation_candidat_1 = rvd.candidat_2 AND dc.orientation_candidat_2 = rvd.candidat_1
) AND rvd."from" = dc.orientation
GROUP BY
dc.region,
dc.circo,
rvd."to"
-----
-- Circonscriptions dans lesquelles il y a 2 candidats de même orientation
SELECT region, circo, orientation, string_agg(nuance, ', ' ORDER BY exprimes DESC) AS nuances
FROM cleaned_undecided
WHERE qualified = TRUE
GROUP BY region, circo, orientation
HAVING count(*) >= 2
ORDER BY orientation, region, circo
-- Duels en cas de désistement automatique
WITH duels_circo AS (
SELECT DISTINCT circo_code, orientation_candidat_1, orientation_candidat_2
FROM tour_2
)
SELECT orientation_candidat_1, orientation_candidat_2, count(*) AS n
FROM duels_circo
GROUP BY orientation_candidat_1, orientation_candidat_2
ORDER BY n DESC;
-- Après report de voix
WITH ordered_projection AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY region, circo ORDER BY projection DESC) AS pos
FROM projection
)
SELECT orientation, count(*) AS n
FROM ordered_projection
WHERE pos = 1
GROUP BY orientation
ORDER BY n DESC;
-- Projection avec faible écart
CREATE OR REPLACE TEMP TABLE difference AS
WITH diff_null AS (
SELECT
p.region,
p.circo,
FIRST(p.orientation) OVER (PARTITION BY p.region, p.circo ORDER BY p.projection DESC) AS elu,
ABS(p.projection - lag(p.projection) OVER (PARTITION BY p.region, p.circo)) AS diff,
ABS(pv.projection - lag(pv.projection) OVER (PARTITION BY p.region, p.circo)) AS diff_voix
FROM projection AS p
JOIN projection_voix AS pv
ON p.region = pv.region
AND p.circo = pv.circo
AND p.orientation = pv.orientation
ORDER BY p.region, p.circo
), duels_circo AS (
SELECT DISTINCT circo_code, region, circo, orientation_candidat_1, orientation_candidat_2
FROM tour_2
)
SELECT dc.*, d.elu, d.diff, d.diff_voix
FROM duels_circo AS dc
JOIN diff_null AS d
ON d.region = dc.region
AND d.circo = dc.circo
WHERE d.diff IS NOT NULL;
-- Nombre de duels avec un écart entre les candidats de 1, 2, 5, 10, 20 et 50 pourcents
WITH bucket_diff_orientation AS (
SELECT
orientation_candidat_1 || ' v ' || orientation_candidat_2 AS duel,
elu AS "projection élu",
SUM(CASE WHEN diff < 0.01 THEN 1 ELSE 0 END) AS "1%",
SUM(CASE WHEN diff < 0.02 THEN 1 ELSE 0 END) AS "2%",
SUM(CASE WHEN diff < 0.05 THEN 1 ELSE 0 END) AS "5%",
SUM(CASE WHEN diff < 0.10 THEN 1 ELSE 0 END) AS "10%",
SUM(CASE WHEN diff < 0.20 THEN 1 ELSE 0 END) AS "20%",
SUM(CASE WHEN diff < 0.50 THEN 1 ELSE 0 END) AS "50%",
COUNT(*) AS total
FROM difference
GROUP BY
orientation_candidat_1,
orientation_candidat_2,
elu
), ordered_bucket_diff_orientation AS (
SELECT
*,
ROW_NUMBER() OVER (ORDER BY duel DESC, "projection élu" DESC) AS r
FROM bucket_diff_orientation
), res AS (
SELECT *
FROM ordered_bucket_diff_orientation
UNION ALL
SELECT
'TOTAL' AS duel,
'' AS "projection élu",
SUM("1%") AS "1%",
SUM("2%") AS "3%",
SUM("5%") AS "5%",
SUM("10%") AS "10%",
SUM("20%") AS "30%",
SUM("50%") AS "50%",
SUM(total) AS total,
0 AS r
FROM ordered_bucket_diff_orientation
)
SELECT * EXCLUDE (r)
FROM res
ORDER BY r DESC;
-- Pour les ecarts de moins de 5%, à combien de voix cela se joue ?
SELECT
orientation_candidat_1 || ' v ' || orientation_candidat_2 AS duel,
elu AS "projection élu",
round(min(diff_voix)) AS "min",
round(max(diff_voix)) AS "max",
round(mean(diff_voix)) AS "moyenne",
round(quantile_disc(diff_voix, 0.25)) AS "1er Quartile",
round(median(diff_voix)) AS "médianne",
round(quantile_disc(diff_voix, 0.75)) AS "3e Quartile",
FROM difference
WHERE diff < 0.05
GROUP BY
orientation_candidat_1,
orientation_candidat_2,
elu
ORDER BY
duel,
"projection élu"
-- Les "swing" circos où la gauche est présente
SELECT *
FROM difference
WHERE orientation_candidat_1 = 'Gauche'
OR orientation_candidat_2 = 'Gauche'
ORDER BY diff
-- Rank des "swing" circos analysés par Averroès
SELECT
circo_code AS 'Code Circo',
region AS 'Région',
circo AS 'Circo',
CASE WHEN diff < 0.01 THEN TRUE ELSE FALSE END AS '<1%',
CASE WHEN diff < 0.02 THEN TRUE ELSE FALSE END AS '<2%',
CASE WHEN diff < 0.05 THEN TRUE ELSE FALSE END AS '<5%',
CASE WHEN elu = 'Gauche' THEN TRUE ELSE FALSE END AS 'Gauche favorite ?'
FROM difference
WHERE circo_code IN (
'0502',
'0703',
'1202',
'1601',
'2103',
'2404',
'3103',
'3704',
'3809',
'6302',
'6303',
'6406',
'6912',
'7701',
'7708',
'8002',
'8401',
'8602',
'8703',
'9404',
'9407',
'0402'
)
ORDER BY circo_code