diff --git a/Chapter_13/Chapter_13.sql b/Chapter_13/Chapter_13.sql index 289288e..94ab83a 100644 --- a/Chapter_13/Chapter_13.sql +++ b/Chapter_13/Chapter_13.sql @@ -1,37 +1,34 @@ --- FIRST EDITION FILE; IGNORE - - --------------------------------------------------------------- --- Practical SQL: A Beginner's Guide to Storytelling with Data +--------------------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition -- by Anthony DeBarros -- Chapter 13 Code Examples --------------------------------------------------------------- +---------------------------------------------------------------------------- -- Listing 13-1: Using a subquery in a WHERE clause -SELECT geo_name, - state_us_abbreviation, - p0010001 -FROM us_counties_2010 -WHERE p0010001 >= ( - SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010 +SELECT county_name, + state_name, + pop_est_2019 +FROM us_counties_pop_est_2019 +WHERE pop_est_2019 >= ( + SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019) + FROM us_counties_pop_est_2019 ) -ORDER BY p0010001 DESC; +ORDER BY pop_est_2019 DESC; -- Listing 13-2: Using a subquery in a WHERE clause for DELETE -CREATE TABLE us_counties_2010_top10 AS -SELECT * FROM us_counties_2010; +CREATE TABLE us_counties_2019_top10 AS +SELECT * FROM us_counties_pop_est_2019; -DELETE FROM us_counties_2010_top10 -WHERE p0010001 < ( - SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010_top10 +DELETE FROM us_counties_2019_top10 +WHERE pop_est_2019 < ( + SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019) + FROM us_counties_2019_top10 ); -SELECT count(*) FROM us_counties_2010_top10; +SELECT count(*) FROM us_counties_2019_top10; -- Listing 13-3: Subquery as a derived table in a FROM clause @@ -39,97 +36,83 @@ SELECT round(calcs.average, 0) as average, calcs.median, round(calcs.average - calcs.median, 0) AS median_average_diff FROM ( - SELECT avg(p0010001) AS average, + SELECT avg(pop_est_2019) AS average, percentile_cont(.5) - WITHIN GROUP (ORDER BY p0010001)::numeric(10,1) AS median - FROM us_counties_2010 + WITHIN GROUP (ORDER BY pop_est_2019)::numeric AS median + FROM us_counties_pop_est_2019 ) AS calcs; -- Listing 13-4: Joining two derived tables -SELECT census.state_us_abbreviation AS st, - census.st_population, - plants.plant_count, - round((plants.plant_count/census.st_population::numeric(10,1)) * 1000000, 1) - AS plants_per_million +SELECT census.state_name AS st, + census.pop_est_2018, + est.establishment_count, + round((est.establishment_count/census.pop_est_2018::numeric) * 1000, 1) + AS estabs_per_thousand FROM ( SELECT st, - count(*) AS plant_count - FROM meat_poultry_egg_inspect + sum(establishments) AS establishment_count + FROM cbp_naics_72_establishments GROUP BY st ) - AS plants + AS est JOIN ( - SELECT state_us_abbreviation, - sum(p0010001) AS st_population - FROM us_counties_2010 - GROUP BY state_us_abbreviation + SELECT state_name, + sum(pop_est_2018) AS pop_est_2018 + FROM us_counties_pop_est_2019 + GROUP BY state_name ) AS census -ON plants.st = census.state_us_abbreviation -ORDER BY plants_per_million DESC; +ON est.st = census.state_name +ORDER BY estabs_per_thousand DESC; -- Listing 13-5: Adding a subquery to a column list -SELECT geo_name, - state_us_abbreviation AS st, - p0010001 AS total_pop, - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010) AS us_median -FROM us_counties_2010; +SELECT county_name, + state_name AS st, + pop_est_2019, + (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019) + FROM us_counties_pop_est_2019) AS us_median +FROM us_counties_pop_est_2019; --- Listing 13-6: Using a subquery expression in a calculation +-- Listing 13-6: Using a subquery in a calculation -SELECT geo_name, - state_us_abbreviation AS st, - p0010001 AS total_pop, - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010) AS us_median, - p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010) AS diff_from_median -FROM us_counties_2010 -WHERE (p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) - FROM us_counties_2010)) +SELECT county_name, + state_name AS st, + pop_est_2019, + pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019) + FROM us_counties_pop_est_2019) AS diff_from_median +FROM us_counties_pop_est_2019 +WHERE (pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019) + FROM us_counties_pop_est_2019)) BETWEEN -1000 AND 1000; - --- BONUS: Subquery expressions --- If you'd like to try the IN, EXISTS, and NOT EXISTS expressions on pages 199-200, --- here's the code to create a retirees table. The queries below are similar --- to the hypothetical examples on pages 199 and 200. You will need the --- employees table you created in Chapter 6. - --- Create table and insert data +-- Listing 13-7: Creating and filling a retirees table + CREATE TABLE retirees ( id int, - first_name varchar(50), - last_name varchar(50) + first_name text, + last_name text ); -INSERT INTO retirees -VALUES (2, 'Lee', 'Smith'), - (4, 'Janet', 'King'); +INSERT INTO retirees +VALUES (2, 'Janet', 'King'), + (4, 'Michael', 'Taylor'); + + +-- Listing 13-8: Generating values for the IN operator --- Generating values for the IN operator SELECT first_name, last_name FROM employees WHERE emp_id IN ( SELECT id FROM retirees); + +-- Listing 13-9: Using a correlated subquery with WHERE EXISTS --- Checking whether values exist (returns all rows from employees --- if the expression evaluates as true) -SELECT first_name, last_name -FROM employees -WHERE EXISTS ( - SELECT id - FROM retirees); - --- Using a correlated subquery to find matching values from employees --- in retirees. SELECT first_name, last_name FROM employees WHERE EXISTS ( @@ -137,65 +120,115 @@ WHERE EXISTS ( FROM retirees WHERE id = employees.emp_id); - - --- Listing 13-7: Using a simple CTE to find large counties +-- Listing 13-10: Using a correlated subquery with WHERE NOT EXISTS -WITH - large_counties (geo_name, st, p0010001) -AS - ( - SELECT geo_name, state_us_abbreviation, p0010001 - FROM us_counties_2010 - WHERE p0010001 >= 100000 - ) -SELECT st, count(*) +SELECT first_name, last_name +FROM employees +WHERE NOT EXISTS ( + SELECT id + FROM retirees + WHERE id = employees.emp_id); + +-- Listing 13-11: Using LATERAL subqueries in the FROM clause + +SELECT county_name, + state_name, + pop_est_2018, + pop_est_2019, + raw_chg, + round(pct_chg * 100, 2) AS pct_chg +FROM us_counties_pop_est_2019, + LATERAL (SELECT pop_est_2019 - pop_est_2018 AS raw_chg) rc, + LATERAL (SELECT raw_chg / pop_est_2018::numeric AS pct_chg) pc +ORDER BY pct_chg DESC; + +-- Listing 13-12: Using a subquery with a LATERAL join + +ALTER TABLE teachers ADD CONSTRAINT id_key PRIMARY KEY (id); + +CREATE TABLE teachers_lab_access ( + access_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY, + access_time timestamp with time zone, + lab_name text, + teacher_id bigint REFERENCES teachers (id) +); + +INSERT INTO teachers_lab_access (access_time, lab_name, teacher_id) +VALUES ('2022-11-30 08:59:00-05', 'Science A', 2), + ('2022-12-01 08:58:00-05', 'Chemistry B', 2), + ('2022-12-21 09:01:00-05', 'Chemistry A', 2), + ('2022-12-02 11:01:00-05', 'Science B', 6), + ('2022-12-07 10:02:00-05', 'Science A', 6), + ('2022-12-17 16:00:00-05', 'Science B', 6); + +SELECT t.first_name, t.last_name, a.access_time, a.lab_name +FROM teachers t +LEFT JOIN LATERAL (SELECT * + FROM teachers_lab_access + WHERE teacher_id = t.id + ORDER BY access_time DESC + LIMIT 2) a +ON true; + +-- Common Table Expressions + +-- Listing 13-13: Using a simple CTE to count large counties + +WITH large_counties (county_name, state_name, pop_est_2019) +AS ( + SELECT county_name, state_name, pop_est_2019 + FROM us_counties_pop_est_2019 + WHERE pop_est_2019 >= 100000 + ) +SELECT state_name, count(*) FROM large_counties -GROUP BY st +GROUP BY state_name ORDER BY count(*) DESC; -- Bonus: You can also write this query as: -SELECT state_us_abbreviation, count(*) -FROM us_counties_2010 -WHERE p0010001 >= 100000 -GROUP BY state_us_abbreviation +SELECT state_name, count(*) +FROM us_counties_pop_est_2019 +WHERE pop_est_2019 >= 100000 +GROUP BY state_name ORDER BY count(*) DESC; --- Listing 13-8: Using CTEs in a table join +-- Listing 13-14: Using CTEs in a table join WITH - counties (st, population) AS - (SELECT state_us_abbreviation, sum(population_count_100_percent) - FROM us_counties_2010 - GROUP BY state_us_abbreviation), + counties (st, pop_est_2018) AS + (SELECT state_name, sum(pop_est_2018) + FROM us_counties_pop_est_2019 + GROUP BY state_name), - plants (st, plants) AS - (SELECT st, count(*) AS plants - FROM meat_poultry_egg_inspect + establishments (st, establishment_count) AS + (SELECT st, sum(establishments) AS establishment_count + FROM cbp_naics_72_establishments GROUP BY st) SELECT counties.st, - population, - plants, - round((plants/population::numeric(10,1))*1000000, 1) AS per_million -FROM counties JOIN plants -ON counties.st = plants.st -ORDER BY per_million DESC; + pop_est_2018, + establishment_count, + round((establishments.establishment_count / + counties.pop_est_2018::numeric(10,1)) * 1000, 1) + AS estabs_per_thousand +FROM counties JOIN establishments +ON counties.st = establishments.st +ORDER BY estabs_per_thousand DESC; --- Listing 13-9: Using CTEs to minimize redundant code +-- Listing 13-15: Using CTEs to minimize redundant code -WITH us_median AS - (SELECT percentile_cont(.5) - WITHIN GROUP (ORDER BY p0010001) AS us_median_pop - FROM us_counties_2010) +WITH us_median AS + (SELECT percentile_cont(.5) + WITHIN GROUP (ORDER BY pop_est_2019) AS us_median_pop + FROM us_counties_pop_est_2019) -SELECT geo_name, - state_us_abbreviation AS st, - p0010001 AS total_pop, +SELECT county_name, + state_name AS st, + pop_est_2019, us_median_pop, - p0010001 - us_median_pop AS diff_from_median -FROM us_counties_2010 CROSS JOIN us_median -WHERE (p0010001 - us_median_pop) + pop_est_2019 - us_median_pop AS diff_from_median +FROM us_counties_pop_est_2019 CROSS JOIN us_median +WHERE (pop_est_2019 - us_median_pop) BETWEEN -1000 AND 1000; @@ -204,19 +237,24 @@ WHERE (p0010001 - us_median_pop) CREATE EXTENSION tablefunc; --- Listing 13-10: Creating and filling the ice_cream_survey table +-- Listing 13-16: Creating and filling the ice_cream_survey table CREATE TABLE ice_cream_survey ( response_id integer PRIMARY KEY, - office varchar(20), - flavor varchar(20) + office text, + flavor text ); COPY ice_cream_survey FROM 'C:\YourDirectory\ice_cream_survey.csv' WITH (FORMAT CSV, HEADER); --- Listing 13-11: Generating the ice cream survey crosstab +-- view the data +SELECT * +FROM ice_cream_survey +LIMIT 5; + +-- Listing 13-17: Generating the ice cream survey crosstab SELECT * FROM crosstab('SELECT office, @@ -231,27 +269,26 @@ FROM crosstab('SELECT office, GROUP BY flavor ORDER BY flavor') -AS (office varchar(20), +AS (office text, chocolate bigint, strawberry bigint, vanilla bigint); --- Listing 13-12: Creating and filling a temperature_readings table +-- Listing 13-18: Creating and filling a temperature_readings table CREATE TABLE temperature_readings ( - reading_id bigserial PRIMARY KEY, - station_name varchar(50), + station_name text, observation_date date, max_temp integer, - min_temp integer + min_temp integer, + CONSTRAINT temp_key PRIMARY KEY (station_name, observation_date) ); -COPY temperature_readings - (station_name, observation_date, max_temp, min_temp) +COPY temperature_readings FROM 'C:\YourDirectory\temperature_readings.csv' WITH (FORMAT CSV, HEADER); --- Listing 13-13: Generating the temperature readings crosstab +-- Listing 13-19: Generating the temperature readings crosstab SELECT * FROM crosstab('SELECT @@ -267,7 +304,7 @@ FROM crosstab('SELECT 'SELECT month FROM generate_series(1,12) month') -AS (station varchar(50), +AS (station text, jan numeric(3,0), feb numeric(3,0), mar numeric(3,0), @@ -282,28 +319,30 @@ AS (station varchar(50), dec numeric(3,0) ); --- Listing 13-14: Re-classifying temperature data with CASE +-- Listing 13-20: Re-classifying temperature data with CASE SELECT max_temp, CASE WHEN max_temp >= 90 THEN 'Hot' - WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' - WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' - WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' - WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' - ELSE 'Inhumane' + WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm' + WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant' + WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold' + WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid' + WHEN max_temp < 20 THEN 'Inhumane' + ELSE 'No reading' END AS temperature_group FROM temperature_readings; --- Listing 13-15: Using CASE in a Common Table Expression +-- Listing 13-21: Using CASE in a Common Table Expression WITH temps_collapsed (station_name, max_temperature_group) AS (SELECT station_name, CASE WHEN max_temp >= 90 THEN 'Hot' - WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' - WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' - WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' - WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' - ELSE 'Inhumane' + WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm' + WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant' + WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold' + WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid' + WHEN max_temp < 20 THEN 'Inhumane' + ELSE 'No reading' END FROM temperature_readings) @@ -311,3 +350,19 @@ SELECT station_name, max_temperature_group, count(*) FROM temps_collapsed GROUP BY station_name, max_temperature_group ORDER BY station_name, count(*) DESC; + + + +-- UNUSED +SELECT county_name, + state_name, + pop_est_2019, + natural_chg, + net_migration, + natural_chg + net_migration AS total_chg +FROM us_counties_pop_est_2019, + LATERAL (SELECT births_2019 - deaths_2019 AS natural_chg) AS nc, + LATERAL (SELECT international_migr_2019 + domestic_migr_2019 AS net_migration) AS nm +WHERE (net_migration > 0 AND natural_chg < 0) + AND (abs(net_migration) > abs(natural_chg)) +ORDER BY total_chg DESC; diff --git a/Try_It_Yourself/Try_It_Yourself.sql b/Try_It_Yourself/Try_It_Yourself.sql index 861e410..fa6074a 100644 --- a/Try_It_Yourself/Try_It_Yourself.sql +++ b/Try_It_Yourself/Try_It_Yourself.sql @@ -796,3 +796,74 @@ WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval; -- Note: Both correlations are strong, with r values of 0.80 or higher. We'd -- expect this given the cost of a taxi ride is based on both time and distance. + +---------------------------------------------------------------------------- +-- Chapter 13: Advanced Query Techniques +---------------------------------------------------------------------------- + +-- 1. Revise the code in Listing 13-21 to dig deeper into the nuances of +-- Waikiki’s high temperatures. Limit the temps_collapsed table to the Waikiki +-- maximum daily temperature observations. Then use the WHEN clauses in the +-- CASE statement to reclassify the temperatures into seven groups that would +-- result in the following text output: + +-- '90 or more' +-- '88-89' +-- '86-87' +-- '84-85' +-- '82-83' +-- '80-81' +-- '79 or less' + +-- In which of those groups does Waikiki’s daily maximum temperature fall most +-- often? + +-- Answer: Between 86 and 87 degrees. Nice. + +WITH temps_collapsed (station_name, max_temperature_group) AS + (SELECT station_name, + CASE WHEN max_temp >= 90 THEN '90 or more' + WHEN max_temp >= 88 AND max_temp < 90 THEN '88-89' + WHEN max_temp >= 86 AND max_temp < 88 THEN '86-87' + WHEN max_temp >= 84 AND max_temp < 86 THEN '84-85' + WHEN max_temp >= 82 AND max_temp < 84 THEN '82-83' + WHEN max_temp >= 80 AND max_temp < 82 THEN '80-81' + WHEN max_temp < 80 THEN '79 or less' + END + FROM temperature_readings + WHERE station_name = 'WAIKIKI 717.2 HI US') + +SELECT station_name, max_temperature_group, count(*) +FROM temps_collapsed +GROUP BY station_name, max_temperature_group +ORDER BY max_temperature_group; + +-- 2. Revise the ice cream survey crosstab in Listing 13-17 to flip the table. +-- In other words, make flavor the rows and office the columns. Which elements +-- of the query do you need to change? Are the counts different? + +-- Answer: You need to re-order the columns in the first subquery so flavor is +-- first and office is second. count(*) stays third. Then, you must change +-- the second subquery to produce a grouped list of office. Finally, you must +-- add the office names to the output list. + +-- The numbers don't change, just the order presented in the crosstab. + +SELECT * +FROM crosstab('SELECT flavor, + office, + count(*) + FROM ice_cream_survey + GROUP BY flavor, office + ORDER BY flavor', + + 'SELECT office + FROM ice_cream_survey + GROUP BY office + ORDER BY office') + +AS (flavor text, + downtown bigint, + midtown bigint, + uptown bigint); +