Chapter 13 work in progress

2020-12-23 14:15:56 -05:00 · 2020-12-23 14:15:56 -05:00 · c9a8e792bf
commit c9a8e792bf
parent e049ca3a83
2 changed files with 272 additions and 146 deletions
--- a/Chapter_13/Chapter_13.sql
+++ b/Chapter_13/Chapter_13.sql
@ -1,37 +1,34 @@
-- FIRST EDITION FILE; IGNORE
-
-
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
+---------------------------------------------------------------------------
+-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
 -- by Anthony DeBarros

 -- Chapter 13 Code Examples
--------------------------------------------------------------
+----------------------------------------------------------------------------

 -- Listing 13-1: Using a subquery in a WHERE clause

-SELECT geo_name,
-       state_us_abbreviation,
-       p0010001
-FROM us_counties_2010
-WHERE p0010001 >= (
-    SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001)
-    FROM us_counties_2010
+SELECT county_name,
+       state_name,
+       pop_est_2019
+FROM us_counties_pop_est_2019
+WHERE pop_est_2019 >= (
+    SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019)
+    FROM us_counties_pop_est_2019
    )
-ORDER BY p0010001 DESC;
+ORDER BY pop_est_2019 DESC;

 -- Listing 13-2: Using a subquery in a WHERE clause for DELETE

-CREATE TABLE us_counties_2010_top10 AS
-SELECT * FROM us_counties_2010;
+CREATE TABLE us_counties_2019_top10 AS
+SELECT * FROM us_counties_pop_est_2019;

-DELETE FROM us_counties_2010_top10
-WHERE p0010001 < (
-    SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001)
-    FROM us_counties_2010_top10
+DELETE FROM us_counties_2019_top10
+WHERE pop_est_2019 < (
+    SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY pop_est_2019)
+    FROM us_counties_2019_top10
    );

-SELECT count(*) FROM us_counties_2010_top10;
+SELECT count(*) FROM us_counties_2019_top10;

 -- Listing 13-3: Subquery as a derived table in a FROM clause

@ -39,97 +36,83 @@ SELECT round(calcs.average, 0) as average,
       calcs.median,
       round(calcs.average - calcs.median, 0) AS median_average_diff
 FROM (
-     SELECT avg(p0010001) AS average,
+     SELECT avg(pop_est_2019) AS average,
            percentile_cont(.5)
-                WITHIN GROUP (ORDER BY p0010001)::numeric(10,1) AS median
-     FROM us_counties_2010
+                WITHIN GROUP (ORDER BY pop_est_2019)::numeric AS median
+     FROM us_counties_pop_est_2019
     )
 AS calcs;

 -- Listing 13-4: Joining two derived tables

-SELECT census.state_us_abbreviation AS st,
-       census.st_population,
-       plants.plant_count,
-       round((plants.plant_count/census.st_population::numeric(10,1)) * 1000000, 1)
-           AS plants_per_million
+SELECT census.state_name AS st,
+       census.pop_est_2018,
+       est.establishment_count,
+       round((est.establishment_count/census.pop_est_2018::numeric) * 1000, 1)
+           AS estabs_per_thousand
 FROM
    (
         SELECT st,
-                count(*) AS plant_count
-         FROM meat_poultry_egg_inspect
+                sum(establishments) AS establishment_count
+         FROM cbp_naics_72_establishments
         GROUP BY st
    )
-    AS plants
+    AS est
 JOIN
    (
-        SELECT state_us_abbreviation,
-               sum(p0010001) AS st_population
-        FROM us_counties_2010
-        GROUP BY state_us_abbreviation
+        SELECT state_name,
+               sum(pop_est_2018) AS pop_est_2018
+        FROM us_counties_pop_est_2019
+        GROUP BY state_name
    )
    AS census
-ON plants.st = census.state_us_abbreviation
-ORDER BY plants_per_million DESC;
+ON est.st = census.state_name
+ORDER BY estabs_per_thousand DESC;

 -- Listing 13-5: Adding a subquery to a column list

-SELECT geo_name,
-       state_us_abbreviation AS st,
-       p0010001 AS total_pop,
-       (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
-        FROM us_counties_2010) AS us_median
-FROM us_counties_2010;
+SELECT county_name,
+       state_name AS st,
+       pop_est_2019,
+       (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
+        FROM us_counties_pop_est_2019) AS us_median
+FROM us_counties_pop_est_2019;

-- Listing 13-6: Using a subquery expression in a calculation
+-- Listing 13-6: Using a subquery in a calculation

-SELECT geo_name,
-       state_us_abbreviation AS st,
-       p0010001 AS total_pop,
-       (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
-        FROM us_counties_2010) AS us_median,
-       p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
-                   FROM us_counties_2010) AS diff_from_median
-FROM us_counties_2010
-WHERE (p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
-                   FROM us_counties_2010))
+SELECT county_name,
+       state_name AS st,
+       pop_est_2019,
+       pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
+                       FROM us_counties_pop_est_2019) AS diff_from_median
+FROM us_counties_pop_est_2019
+WHERE (pop_est_2019 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY pop_est_2019)
+                       FROM us_counties_pop_est_2019))
       BETWEEN -1000 AND 1000;

+-- Listing 13-7: Creating and filling a retirees table

-- BONUS: Subquery expressions
-- If you'd like to try the IN, EXISTS, and NOT EXISTS expressions on pages 199-200,
-- here's the code to create a retirees table. The queries below are similar
-- to the hypothetical examples on pages 199 and 200. You will need the
-- employees table you created in Chapter 6.
-              
-- Create table and insert data
 CREATE TABLE retirees (
    id int,
-    first_name varchar(50),
-    last_name varchar(50)
+    first_name text,
+    last_name text
 );

 INSERT INTO retirees
-VALUES (2, 'Lee', 'Smith'),
-       (4, 'Janet', 'King');
+VALUES (2, 'Janet', 'King'),
+       (4, 'Michael', 'Taylor');
+
+
+-- Listing 13-8: Generating values for the IN operator

-- Generating values for the IN operator
 SELECT first_name, last_name
 FROM employees
 WHERE emp_id IN (
    SELECT id
    FROM retirees);
    
-- Checking whether values exist (returns all rows from employees
-- if the expression evaluates as true)
-SELECT first_name, last_name
-FROM employees
-WHERE EXISTS (
-    SELECT id
-    FROM retirees);
+-- Listing 13-9: Using a correlated subquery with WHERE EXISTS

-- Using a correlated subquery to find matching values from employees
-- in retirees.
 SELECT first_name, last_name
 FROM employees
 WHERE EXISTS (
@ -137,65 +120,115 @@ WHERE EXISTS (
    FROM retirees
    WHERE id = employees.emp_id);

+-- Listing 13-10: Using a correlated subquery with WHERE NOT EXISTS

+SELECT first_name, last_name
+FROM employees
+WHERE NOT EXISTS (
+    SELECT id
+    FROM retirees
+    WHERE id = employees.emp_id);

-- Listing 13-7: Using a simple CTE to find large counties
+-- Listing 13-11: Using LATERAL subqueries in the FROM clause

-WITH
-    large_counties (geo_name, st, p0010001)
-AS
-    (
-        SELECT geo_name, state_us_abbreviation, p0010001
-        FROM us_counties_2010
-        WHERE p0010001 >= 100000
+SELECT county_name,
+       state_name,
+       pop_est_2018,
+       pop_est_2019,
+       raw_chg,
+       round(pct_chg * 100, 2) AS pct_chg
+FROM us_counties_pop_est_2019,
+     LATERAL (SELECT pop_est_2019 - pop_est_2018 AS raw_chg) rc,
+     LATERAL (SELECT raw_chg / pop_est_2018::numeric AS pct_chg) pc
+ORDER BY pct_chg DESC;
+
+-- Listing 13-12: Using a subquery with a LATERAL join
+
+ALTER TABLE teachers ADD CONSTRAINT id_key PRIMARY KEY (id);
+
+CREATE TABLE teachers_lab_access (
+    access_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
+    access_time timestamp with time zone,
+    lab_name text,
+    teacher_id bigint REFERENCES teachers (id)
+);
+
+INSERT INTO teachers_lab_access (access_time, lab_name, teacher_id)
+VALUES ('2022-11-30 08:59:00-05', 'Science A', 2),
+       ('2022-12-01 08:58:00-05', 'Chemistry B', 2),
+       ('2022-12-21 09:01:00-05', 'Chemistry A', 2),
+       ('2022-12-02 11:01:00-05', 'Science B', 6),
+       ('2022-12-07 10:02:00-05', 'Science A', 6),
+       ('2022-12-17 16:00:00-05', 'Science B', 6);
+
+SELECT t.first_name, t.last_name, a.access_time, a.lab_name
+FROM teachers t
+LEFT JOIN LATERAL (SELECT *
+                   FROM teachers_lab_access
+                   WHERE teacher_id = t.id
+                   ORDER BY access_time DESC
+                   LIMIT 2) a
+ON true;
+
+-- Common Table Expressions
+
+-- Listing 13-13: Using a simple CTE to count large counties
+
+WITH large_counties (county_name, state_name, pop_est_2019)
+AS (
+    SELECT county_name, state_name, pop_est_2019
+    FROM us_counties_pop_est_2019
+    WHERE pop_est_2019 >= 100000
   )
-SELECT st, count(*)
+SELECT state_name, count(*)
 FROM large_counties
-GROUP BY st
+GROUP BY state_name
 ORDER BY count(*) DESC;

 -- Bonus: You can also write this query as:
-SELECT state_us_abbreviation, count(*)
-FROM us_counties_2010
-WHERE p0010001 >= 100000
-GROUP BY state_us_abbreviation
+SELECT state_name, count(*)
+FROM us_counties_pop_est_2019
+WHERE pop_est_2019 >= 100000
+GROUP BY state_name
 ORDER BY count(*) DESC;

-- Listing 13-8: Using CTEs in a table join
+-- Listing 13-14: Using CTEs in a table join

 WITH
-    counties (st, population) AS
-    (SELECT state_us_abbreviation, sum(population_count_100_percent)
-     FROM us_counties_2010
-     GROUP BY state_us_abbreviation),
+    counties (st, pop_est_2018) AS
+    (SELECT state_name, sum(pop_est_2018)
+     FROM us_counties_pop_est_2019
+     GROUP BY state_name),

-    plants (st, plants) AS
-    (SELECT st, count(*) AS plants
-     FROM meat_poultry_egg_inspect
+    establishments (st, establishment_count) AS
+    (SELECT st, sum(establishments) AS establishment_count
+     FROM cbp_naics_72_establishments
     GROUP BY st)

 SELECT counties.st,
-       population,
-       plants,
-       round((plants/population::numeric(10,1))*1000000, 1) AS per_million
-FROM counties JOIN plants
-ON counties.st = plants.st
-ORDER BY per_million DESC;
+       pop_est_2018,
+       establishment_count,
+       round((establishments.establishment_count / 
+              counties.pop_est_2018::numeric(10,1)) * 1000, 1)
+           AS estabs_per_thousand
+FROM counties JOIN establishments
+ON counties.st = establishments.st
+ORDER BY estabs_per_thousand DESC;

-- Listing 13-9: Using CTEs to minimize redundant code
+-- Listing 13-15: Using CTEs to minimize redundant code

 WITH us_median AS
    (SELECT percentile_cont(.5)
-     WITHIN GROUP (ORDER BY p0010001) AS us_median_pop
-     FROM us_counties_2010)
+     WITHIN GROUP (ORDER BY pop_est_2019) AS us_median_pop
+     FROM us_counties_pop_est_2019)

-SELECT geo_name,
-       state_us_abbreviation AS st,
-       p0010001 AS total_pop,
+SELECT county_name,
+       state_name AS st,
+       pop_est_2019,
       us_median_pop,
-       p0010001 - us_median_pop AS diff_from_median 
-FROM us_counties_2010 CROSS JOIN us_median
-WHERE (p0010001 - us_median_pop)
+       pop_est_2019 - us_median_pop AS diff_from_median
+FROM us_counties_pop_est_2019 CROSS JOIN us_median
+WHERE (pop_est_2019 - us_median_pop)
       BETWEEN -1000 AND 1000;


@ -204,19 +237,24 @@ WHERE (p0010001 - us_median_pop)

 CREATE EXTENSION tablefunc;

-- Listing 13-10: Creating and filling the ice_cream_survey table
+-- Listing 13-16: Creating and filling the ice_cream_survey table

 CREATE TABLE ice_cream_survey (
    response_id integer PRIMARY KEY,
-    office varchar(20),
-    flavor varchar(20)
+    office text,
+    flavor text
 );

 COPY ice_cream_survey
 FROM 'C:\YourDirectory\ice_cream_survey.csv'
 WITH (FORMAT CSV, HEADER);

-- Listing 13-11: Generating the ice cream survey crosstab
+-- view the data
+SELECT * 
+FROM ice_cream_survey
+LIMIT 5;
+
+-- Listing 13-17: Generating the ice cream survey crosstab

 SELECT *
 FROM crosstab('SELECT office,
@ -231,27 +269,26 @@ FROM crosstab('SELECT office,
               GROUP BY flavor
               ORDER BY flavor')

-AS (office varchar(20),
+AS (office text,
    chocolate bigint,
    strawberry bigint,
    vanilla bigint);

-- Listing 13-12: Creating and filling a temperature_readings table
+-- Listing 13-18: Creating and filling a temperature_readings table

 CREATE TABLE temperature_readings (
-    reading_id bigserial PRIMARY KEY,
-    station_name varchar(50),
+    station_name text,
    observation_date date,
    max_temp integer,
-    min_temp integer
+    min_temp integer,
+    CONSTRAINT temp_key PRIMARY KEY (station_name, observation_date)
 );

 COPY temperature_readings
-     (station_name, observation_date, max_temp, min_temp)
 FROM 'C:\YourDirectory\temperature_readings.csv'
 WITH (FORMAT CSV, HEADER);

-- Listing 13-13: Generating the temperature readings crosstab
+-- Listing 13-19: Generating the temperature readings crosstab

 SELECT *
 FROM crosstab('SELECT
@ -267,7 +304,7 @@ FROM crosstab('SELECT
              'SELECT month
               FROM generate_series(1,12) month')

-AS (station varchar(50),
+AS (station text,
    jan numeric(3,0),
    feb numeric(3,0),
    mar numeric(3,0),
@ -282,28 +319,30 @@ AS (station varchar(50),
    dec numeric(3,0)
 );

-- Listing 13-14: Re-classifying temperature data with CASE
+-- Listing 13-20: Re-classifying temperature data with CASE

 SELECT max_temp,
       CASE WHEN max_temp >= 90 THEN 'Hot'
-            WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm'
-            WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant'
-            WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold'
-            WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing'
-            ELSE 'Inhumane'
+            WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm'
+            WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant'
+            WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold'
+            WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid'
+            WHEN max_temp < 20 THEN 'Inhumane'
+            ELSE 'No reading'
        END AS temperature_group
 FROM temperature_readings;

-- Listing 13-15: Using CASE in a Common Table Expression
+-- Listing 13-21: Using CASE in a Common Table Expression

 WITH temps_collapsed (station_name, max_temperature_group) AS
    (SELECT station_name,
           CASE WHEN max_temp >= 90 THEN 'Hot'
-                WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm'
-                WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant'
-                WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold'
-                WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing'
-                ELSE 'Inhumane'
+                WHEN max_temp >= 70 AND max_temp < 90 THEN 'Warm'
+                WHEN max_temp >= 50 AND max_temp < 70 THEN 'Pleasant'
+                WHEN max_temp >= 33 AND max_temp < 50 THEN 'Cold'
+                WHEN max_temp >= 20 AND max_temp < 33 THEN 'Frigid'
+                WHEN max_temp < 20 THEN 'Inhumane'
+                ELSE 'No reading'
            END
    FROM temperature_readings)

@ -311,3 +350,19 @@ SELECT station_name, max_temperature_group, count(*)
 FROM temps_collapsed
 GROUP BY station_name, max_temperature_group
 ORDER BY station_name, count(*) DESC;
+
+
+
+-- UNUSED
+SELECT county_name,
+       state_name,
+       pop_est_2019,
+       natural_chg,
+       net_migration,
+       natural_chg + net_migration AS total_chg
+FROM us_counties_pop_est_2019,
+    LATERAL (SELECT births_2019 - deaths_2019 AS natural_chg) AS nc, 
+    LATERAL (SELECT international_migr_2019 + domestic_migr_2019 AS net_migration) AS nm
+WHERE (net_migration > 0 AND natural_chg < 0) 
+    AND (abs(net_migration) > abs(natural_chg))
+ORDER BY total_chg DESC;
--- a/Try_It_Yourself/Try_It_Yourself.sql
+++ b/Try_It_Yourself/Try_It_Yourself.sql
@ -796,3 +796,74 @@ WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
 -- Note: Both correlations are strong, with r values of 0.80 or higher. We'd
 -- expect this given the cost of a taxi ride is based on both time and distance.

+
+----------------------------------------------------------------------------
+-- Chapter 13: Advanced Query Techniques
+----------------------------------------------------------------------------
+
+-- 1. Revise the code in Listing 13-21 to dig deeper into the nuances of
+-- Waikiki’s high temperatures. Limit the temps_collapsed table to the Waikiki
+-- maximum daily temperature observations. Then use the WHEN clauses in the
+-- CASE statement to reclassify the temperatures into seven groups that would
+-- result in the following text output:
+
+-- '90 or more'
+-- '88-89'
+-- '86-87'
+-- '84-85'
+-- '82-83'
+-- '80-81'
+-- '79 or less'
+
+-- In which of those groups does Waikiki’s daily maximum temperature fall most
+-- often?
+
+-- Answer: Between 86 and 87 degrees. Nice.
+
+WITH temps_collapsed (station_name, max_temperature_group) AS
+    (SELECT station_name,
+           CASE WHEN max_temp >= 90 THEN '90 or more'
+                WHEN max_temp >= 88 AND max_temp < 90 THEN '88-89'
+                WHEN max_temp >= 86 AND max_temp < 88 THEN '86-87'
+                WHEN max_temp >= 84 AND max_temp < 86 THEN '84-85'
+                WHEN max_temp >= 82 AND max_temp < 84 THEN '82-83'
+                WHEN max_temp >= 80 AND max_temp < 82 THEN '80-81'
+                WHEN max_temp < 80 THEN '79 or less'
+            END
+    FROM temperature_readings
+    WHERE station_name = 'WAIKIKI 717.2 HI US')
+
+SELECT station_name, max_temperature_group, count(*)
+FROM temps_collapsed
+GROUP BY station_name, max_temperature_group
+ORDER BY max_temperature_group;
+
+-- 2. Revise the ice cream survey crosstab in Listing 13-17 to flip the table.
+-- In other words, make flavor the rows and office the columns. Which elements
+-- of the query do you need to change? Are the counts different?
+
+-- Answer: You need to re-order the columns in the first subquery so flavor is
+-- first and office is second. count(*) stays third. Then, you must change
+-- the second subquery to produce a grouped list of office. Finally, you must
+-- add the office names to the output list.
+
+-- The numbers don't change, just the order presented in the crosstab.
+
+SELECT *
+FROM crosstab('SELECT flavor,
+                      office,
+                      count(*)
+               FROM ice_cream_survey
+               GROUP BY flavor, office
+               ORDER BY flavor',
+
+              'SELECT office
+               FROM ice_cream_survey
+               GROUP BY office
+               ORDER BY office')
+
+AS (flavor text,
+    downtown bigint,
+    midtown bigint,
+    uptown bigint);
+