Chapter 13 work in progress

This commit is contained in:
anthonydb 2020-12-23 14:15:56 -05:00
parent e049ca3a83
commit c9a8e792bf
2 changed files with 272 additions and 146 deletions

View File

@ -1,37 +1,34 @@
-- FIRST EDITION FILE; IGNORE ---------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
-- by Anthony DeBarros -- by Anthony DeBarros
-- Chapter 13 Code Examples -- Chapter 13 Code Examples
-------------------------------------------------------------- ----------------------------------------------------------------------------
-- Listing 13-1: Using a subquery in a WHERE clause -- Listing 13-1: Using a subquery in a WHERE clause
SELECT geo_name, SELECT county_name,
state_us_abbreviation, state_name,
p0010001 pop_est_2019
FROM us_counties_2010 FROM us_counties_pop_est_2019
WHERE p0010001 >= ( WHERE pop_est_2019 >= (
SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_2010 FROM us_counties_pop_est_2019
) )
ORDER BY p0010001 DESC; ORDER BY pop_est_2019 DESC;
-- Listing 13-2: Using a subquery in a WHERE clause for DELETE -- Listing 13-2: Using a subquery in a WHERE clause for DELETE
CREATE TABLE us_counties_2010_top10 AS CREATE TABLE us_counties_2019_top10 AS
SELECT * FROM us_counties_2010; SELECT * FROM us_counties_pop_est_2019;
DELETE FROM us_counties_2010_top10 DELETE FROM us_counties_2019_top10
WHERE p0010001 < ( WHERE pop_est_2019 < (
SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_2010_top10 FROM us_counties_2019_top10
); );
SELECT count(*) FROM us_counties_2010_top10; SELECT count(*) FROM us_counties_2019_top10;
-- Listing 13-3: Subquery as a derived table in a FROM clause -- Listing 13-3: Subquery as a derived table in a FROM clause
@ -39,97 +36,83 @@ SELECT round(calcs.average, 0) as average,
calcs.median, calcs.median,
round(calcs.average - calcs.median, 0) AS median_average_diff round(calcs.average - calcs.median, 0) AS median_average_diff
FROM ( FROM (
SELECT avg(p0010001) AS average, SELECT avg(pop_est_2019) AS average,
percentile_cont(.5) percentile_cont(.5)
WITHIN GROUP (ORDER BY p0010001)::numeric(10,1) AS median WITHIN GROUP (ORDER BY pop_est_2019)::numeric AS median
FROM us_counties_2010 FROM us_counties_pop_est_2019
) )
AS calcs; AS calcs;
-- Listing 13-4: Joining two derived tables -- Listing 13-4: Joining two derived tables
SELECT census.state_us_abbreviation AS st, SELECT census.state_name AS st,
census.st_population, census.pop_est_2018,
plants.plant_count, est.establishment_count,
round((plants.plant_count/census.st_population::numeric(10,1)) * 1000000, 1) round((est.establishment_count/census.pop_est_2018::numeric) * 1000, 1)
AS plants_per_million AS estabs_per_thousand
FROM FROM
( (
SELECT st, SELECT st,
count(*) AS plant_count sum(establishments) AS establishment_count
FROM meat_poultry_egg_inspect FROM cbp_naics_72_establishments
GROUP BY st GROUP BY st
) )
AS plants AS est
JOIN JOIN
( (
SELECT state_us_abbreviation, SELECT state_name,
sum(p0010001) AS st_population sum(pop_est_2018) AS pop_est_2018
FROM us_counties_2010 FROM us_counties_pop_est_2019
GROUP BY state_us_abbreviation GROUP BY state_name
) )
AS census AS census
ON plants.st = census.state_us_abbreviation ON est.st = census.state_name
ORDER BY plants_per_million DESC; ORDER BY estabs_per_thousand DESC;
-- Listing 13-5: Adding a subquery to a column list -- Listing 13-5: Adding a subquery to a column list
SELECT geo_name, SELECT county_name,
state_us_abbreviation AS st, state_name AS st,
p0010001 AS total_pop, pop_est_2019,
(SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_2010) AS us_median FROM us_counties_pop_est_2019) AS us_median
FROM us_counties_2010; FROM us_counties_pop_est_2019;
-- Listing 13-6: Using a subquery expression in a calculation -- Listing 13-6: Using a subquery in a calculation
SELECT geo_name, SELECT county_name,
state_us_abbreviation AS st, state_name AS st,
p0010001 AS total_pop, pop_est_2019,
(SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_2010) AS us_median, FROM us_counties_pop_est_2019) AS diff_from_median
p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) FROM us_counties_pop_est_2019
FROM us_counties_2010) AS diff_from_median WHERE (pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_2010 FROM us_counties_pop_est_2019))
WHERE (p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
FROM us_counties_2010))
BETWEEN -1000 AND 1000; BETWEEN -1000 AND 1000;
-- Listing 13-7: Creating and filling a retirees table
-- BONUS: Subquery expressions
-- If you'd like to try the IN, EXISTS, and NOT EXISTS expressions on pages 199-200,
-- here's the code to create a retirees table. The queries below are similar
-- to the hypothetical examples on pages 199 and 200. You will need the
-- employees table you created in Chapter 6.
-- Create table and insert data
CREATE TABLE retirees ( CREATE TABLE retirees (
id int, id int,
first_name varchar(50), first_name text,
last_name varchar(50) last_name text
); );
INSERT INTO retirees INSERT INTO retirees
VALUES (2, 'Lee', 'Smith'), VALUES (2, 'Janet', 'King'),
(4, 'Janet', 'King'); (4, 'Michael', 'Taylor');
-- Listing 13-8: Generating values for the IN operator
-- Generating values for the IN operator
SELECT first_name, last_name SELECT first_name, last_name
FROM employees FROM employees
WHERE emp_id IN ( WHERE emp_id IN (
SELECT id SELECT id
FROM retirees); FROM retirees);
-- Checking whether values exist (returns all rows from employees -- Listing 13-9: Using a correlated subquery with WHERE EXISTS
-- if the expression evaluates as true)
SELECT first_name, last_name
FROM employees
WHERE EXISTS (
SELECT id
FROM retirees);
-- Using a correlated subquery to find matching values from employees
-- in retirees.
SELECT first_name, last_name SELECT first_name, last_name
FROM employees FROM employees
WHERE EXISTS ( WHERE EXISTS (
@ -137,65 +120,115 @@ WHERE EXISTS (
FROM retirees FROM retirees
WHERE id = employees.emp_id); WHERE id = employees.emp_id);
-- Listing 13-10: Using a correlated subquery with WHERE NOT EXISTS
SELECT first_name, last_name
FROM employees
WHERE NOT EXISTS (
SELECT id
FROM retirees
WHERE id = employees.emp_id);
-- Listing 13-7: Using a simple CTE to find large counties -- Listing 13-11: Using LATERAL subqueries in the FROM clause
WITH SELECT county_name,
large_counties (geo_name, st, p0010001) state_name,
AS pop_est_2018,
( pop_est_2019,
SELECT geo_name, state_us_abbreviation, p0010001 raw_chg,
FROM us_counties_2010 round(pct_chg * 100, 2) AS pct_chg
WHERE p0010001 >= 100000 FROM us_counties_pop_est_2019,
LATERAL (SELECT pop_est_2019 - pop_est_2018 AS raw_chg) rc,
LATERAL (SELECT raw_chg / pop_est_2018::numeric AS pct_chg) pc
ORDER BY pct_chg DESC;
-- Listing 13-12: Using a subquery with a LATERAL join
ALTER TABLE teachers ADD CONSTRAINT id_key PRIMARY KEY (id);
CREATE TABLE teachers_lab_access (
access_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
access_time timestamp with time zone,
lab_name text,
teacher_id bigint REFERENCES teachers (id)
);
INSERT INTO teachers_lab_access (access_time, lab_name, teacher_id)
VALUES ('2022-11-30 08:59:00-05', 'Science A', 2),
('2022-12-01 08:58:00-05', 'Chemistry B', 2),
('2022-12-21 09:01:00-05', 'Chemistry A', 2),
('2022-12-02 11:01:00-05', 'Science B', 6),
('2022-12-07 10:02:00-05', 'Science A', 6),
('2022-12-17 16:00:00-05', 'Science B', 6);
SELECT t.first_name, t.last_name, a.access_time, a.lab_name
FROM teachers t
LEFT JOIN LATERAL (SELECT *
FROM teachers_lab_access
WHERE teacher_id = t.id
ORDER BY access_time DESC
LIMIT 2) a
ON true;
-- Common Table Expressions
-- Listing 13-13: Using a simple CTE to count large counties
WITH large_counties (county_name, state_name, pop_est_2019)
AS (
SELECT county_name, state_name, pop_est_2019
FROM us_counties_pop_est_2019
WHERE pop_est_2019 >= 100000
) )
SELECT st, count(*) SELECT state_name, count(*)
FROM large_counties FROM large_counties
GROUP BY st GROUP BY state_name
ORDER BY count(*) DESC; ORDER BY count(*) DESC;
-- Bonus: You can also write this query as: -- Bonus: You can also write this query as:
SELECT state_us_abbreviation, count(*) SELECT state_name, count(*)
FROM us_counties_2010 FROM us_counties_pop_est_2019
WHERE p0010001 >= 100000 WHERE pop_est_2019 >= 100000
GROUP BY state_us_abbreviation GROUP BY state_name
ORDER BY count(*) DESC; ORDER BY count(*) DESC;
-- Listing 13-8: Using CTEs in a table join -- Listing 13-14: Using CTEs in a table join
WITH WITH
counties (st, population) AS counties (st, pop_est_2018) AS
(SELECT state_us_abbreviation, sum(population_count_100_percent) (SELECT state_name, sum(pop_est_2018)
FROM us_counties_2010 FROM us_counties_pop_est_2019
GROUP BY state_us_abbreviation), GROUP BY state_name),
plants (st, plants) AS establishments (st, establishment_count) AS
(SELECT st, count(*) AS plants (SELECT st, sum(establishments) AS establishment_count
FROM meat_poultry_egg_inspect FROM cbp_naics_72_establishments
GROUP BY st) GROUP BY st)
SELECT counties.st, SELECT counties.st,
population, pop_est_2018,
plants, establishment_count,
round((plants/population::numeric(10,1))*1000000, 1) AS per_million round((establishments.establishment_count /
FROM counties JOIN plants counties.pop_est_2018::numeric(10,1)) * 1000, 1)
ON counties.st = plants.st AS estabs_per_thousand
ORDER BY per_million DESC; FROM counties JOIN establishments
ON counties.st = establishments.st
ORDER BY estabs_per_thousand DESC;
-- Listing 13-9: Using CTEs to minimize redundant code -- Listing 13-15: Using CTEs to minimize redundant code
WITH us_median AS WITH us_median AS
(SELECT percentile_cont(.5) (SELECT percentile_cont(.5)
WITHIN GROUP (ORDER BY p0010001) AS us_median_pop WITHIN GROUP (ORDER BY pop_est_2019) AS us_median_pop
FROM us_counties_2010) FROM us_counties_pop_est_2019)
SELECT geo_name, SELECT county_name,
state_us_abbreviation AS st, state_name AS st,
p0010001 AS total_pop, pop_est_2019,
us_median_pop, us_median_pop,
p0010001 - us_median_pop AS diff_from_median pop_est_2019 - us_median_pop AS diff_from_median
FROM us_counties_2010 CROSS JOIN us_median FROM us_counties_pop_est_2019 CROSS JOIN us_median
WHERE (p0010001 - us_median_pop) WHERE (pop_est_2019 - us_median_pop)
BETWEEN -1000 AND 1000; BETWEEN -1000 AND 1000;
@ -204,19 +237,24 @@ WHERE (p0010001 - us_median_pop)
CREATE EXTENSION tablefunc; CREATE EXTENSION tablefunc;
-- Listing 13-10: Creating and filling the ice_cream_survey table -- Listing 13-16: Creating and filling the ice_cream_survey table
CREATE TABLE ice_cream_survey ( CREATE TABLE ice_cream_survey (
response_id integer PRIMARY KEY, response_id integer PRIMARY KEY,
office varchar(20), office text,
flavor varchar(20) flavor text
); );
COPY ice_cream_survey COPY ice_cream_survey
FROM 'C:\YourDirectory\ice_cream_survey.csv' FROM 'C:\YourDirectory\ice_cream_survey.csv'
WITH (FORMAT CSV, HEADER); WITH (FORMAT CSV, HEADER);
-- Listing 13-11: Generating the ice cream survey crosstab -- view the data
SELECT *
FROM ice_cream_survey
LIMIT 5;
-- Listing 13-17: Generating the ice cream survey crosstab
SELECT * SELECT *
FROM crosstab('SELECT office, FROM crosstab('SELECT office,
@ -231,27 +269,26 @@ FROM crosstab('SELECT office,
GROUP BY flavor GROUP BY flavor
ORDER BY flavor') ORDER BY flavor')
AS (office varchar(20), AS (office text,
chocolate bigint, chocolate bigint,
strawberry bigint, strawberry bigint,
vanilla bigint); vanilla bigint);
-- Listing 13-12: Creating and filling a temperature_readings table -- Listing 13-18: Creating and filling a temperature_readings table
CREATE TABLE temperature_readings ( CREATE TABLE temperature_readings (
reading_id bigserial PRIMARY KEY, station_name text,
station_name varchar(50),
observation_date date, observation_date date,
max_temp integer, max_temp integer,
min_temp integer min_temp integer,
CONSTRAINT temp_key PRIMARY KEY (station_name, observation_date)
); );
COPY temperature_readings COPY temperature_readings
(station_name, observation_date, max_temp, min_temp)
FROM 'C:\YourDirectory\temperature_readings.csv' FROM 'C:\YourDirectory\temperature_readings.csv'
WITH (FORMAT CSV, HEADER); WITH (FORMAT CSV, HEADER);
-- Listing 13-13: Generating the temperature readings crosstab -- Listing 13-19: Generating the temperature readings crosstab
SELECT * SELECT *
FROM crosstab('SELECT FROM crosstab('SELECT
@ -267,7 +304,7 @@ FROM crosstab('SELECT
'SELECT month 'SELECT month
FROM generate_series(1,12) month') FROM generate_series(1,12) month')
AS (station varchar(50), AS (station text,
jan numeric(3,0), jan numeric(3,0),
feb numeric(3,0), feb numeric(3,0),
mar numeric(3,0), mar numeric(3,0),
@ -282,28 +319,30 @@ AS (station varchar(50),
dec numeric(3,0) dec numeric(3,0)
); );
-- Listing 13-14: Re-classifying temperature data with CASE -- Listing 13-20: Re-classifying temperature data with CASE
SELECT max_temp, SELECT max_temp,
CASE WHEN max_temp >= 90 THEN 'Hot' CASE WHEN max_temp >= 90 THEN 'Hot'
WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm'
WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant'
WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold'
WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid'
ELSE 'Inhumane' WHEN max_temp < 20 THEN 'Inhumane'
ELSE 'No reading'
END AS temperature_group END AS temperature_group
FROM temperature_readings; FROM temperature_readings;
-- Listing 13-15: Using CASE in a Common Table Expression -- Listing 13-21: Using CASE in a Common Table Expression
WITH temps_collapsed (station_name, max_temperature_group) AS WITH temps_collapsed (station_name, max_temperature_group) AS
(SELECT station_name, (SELECT station_name,
CASE WHEN max_temp >= 90 THEN 'Hot' CASE WHEN max_temp >= 90 THEN 'Hot'
WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm'
WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant'
WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold'
WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid'
ELSE 'Inhumane' WHEN max_temp < 20 THEN 'Inhumane'
ELSE 'No reading'
END END
FROM temperature_readings) FROM temperature_readings)
@ -311,3 +350,19 @@ SELECT station_name, max_temperature_group, count(*)
FROM temps_collapsed FROM temps_collapsed
GROUP BY station_name, max_temperature_group GROUP BY station_name, max_temperature_group
ORDER BY station_name, count(*) DESC; ORDER BY station_name, count(*) DESC;
-- UNUSED
SELECT county_name,
state_name,
pop_est_2019,
natural_chg,
net_migration,
natural_chg + net_migration AS total_chg
FROM us_counties_pop_est_2019,
LATERAL (SELECT births_2019 - deaths_2019 AS natural_chg) AS nc,
LATERAL (SELECT international_migr_2019 + domestic_migr_2019 AS net_migration) AS nm
WHERE (net_migration > 0 AND natural_chg < 0)
AND (abs(net_migration) > abs(natural_chg))
ORDER BY total_chg DESC;

View File

@ -796,3 +796,74 @@ WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
-- Note: Both correlations are strong, with r values of 0.80 or higher. We'd -- Note: Both correlations are strong, with r values of 0.80 or higher. We'd
-- expect this given the cost of a taxi ride is based on both time and distance. -- expect this given the cost of a taxi ride is based on both time and distance.
----------------------------------------------------------------------------
-- Chapter 13: Advanced Query Techniques
----------------------------------------------------------------------------
-- 1. Revise the code in Listing 13-21 to dig deeper into the nuances of
-- Waikikis high temperatures. Limit the temps_collapsed table to the Waikiki
-- maximum daily temperature observations. Then use the WHEN clauses in the
-- CASE statement to reclassify the temperatures into seven groups that would
-- result in the following text output:
-- '90 or more'
-- '88-89'
-- '86-87'
-- '84-85'
-- '82-83'
-- '80-81'
-- '79 or less'
-- In which of those groups does Waikikis daily maximum temperature fall most
-- often?
-- Answer: Between 86 and 87 degrees. Nice.
WITH temps_collapsed (station_name, max_temperature_group) AS
(SELECT station_name,
CASE WHEN max_temp >= 90 THEN '90 or more'
WHEN max_temp >= 88 AND max_temp < 90 THEN '88-89'
WHEN max_temp >= 86 AND max_temp < 88 THEN '86-87'
WHEN max_temp >= 84 AND max_temp < 86 THEN '84-85'
WHEN max_temp >= 82 AND max_temp < 84 THEN '82-83'
WHEN max_temp >= 80 AND max_temp < 82 THEN '80-81'
WHEN max_temp < 80 THEN '79 or less'
END
FROM temperature_readings
WHERE station_name = 'WAIKIKI 717.2 HI US')
SELECT station_name, max_temperature_group, count(*)
FROM temps_collapsed
GROUP BY station_name, max_temperature_group
ORDER BY max_temperature_group;
-- 2. Revise the ice cream survey crosstab in Listing 13-17 to flip the table.
-- In other words, make flavor the rows and office the columns. Which elements
-- of the query do you need to change? Are the counts different?
-- Answer: You need to re-order the columns in the first subquery so flavor is
-- first and office is second. count(*) stays third. Then, you must change
-- the second subquery to produce a grouped list of office. Finally, you must
-- add the office names to the output list.
-- The numbers don't change, just the order presented in the crosstab.
SELECT *
FROM crosstab('SELECT flavor,
office,
count(*)
FROM ice_cream_survey
GROUP BY flavor, office
ORDER BY flavor',
'SELECT office
FROM ice_cream_survey
GROUP BY office
ORDER BY office')
AS (flavor text,
downtown bigint,
midtown bigint,
uptown bigint);