Updates to Chapter 6 and Try It Yourself

This commit is contained in:
anthonydb 2020-06-28 16:56:15 -04:00
parent b0f23d78a4
commit 360bade5a2
2 changed files with 185 additions and 158 deletions

View File

@ -1,9 +1,9 @@
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
---------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
-- by Anthony DeBarros
-- Chapter 6 Code Examples
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Listing 6-1: Basic addition, subtraction and multiplication with SQL
@ -34,79 +34,81 @@ SELECT (7 + 8) * 9; -- answer: 135
SELECT 3 ^ 3 - 1; -- answer: 26
SELECT 3 ^ (3 - 1); -- answer: 9
-- Listing 6-4: Selecting Census population columns by race with aliases
-- Listing 6-4: Selecting Census population estimate columns with aliases
SELECT geo_name,
state_us_abbreviation AS "st",
p0010001 AS "Total Population",
p0010003 AS "White Alone",
p0010004 AS "Black or African American Alone",
p0010005 AS "Am Indian/Alaska Native Alone",
p0010006 AS "Asian Alone",
p0010007 AS "Native Hawaiian and Other Pacific Islander Alone",
p0010008 AS "Some Other Race Alone",
p0010009 AS "Two or More Races"
FROM us_counties_2010;
SELECT county_name AS county,
state_name AS state,
pop_est_2019 AS pop,
births_2019 AS births,
deaths_2019 AS deaths,
international_migr_2019 AS int_migr,
domestic_migr_2019 AS dom_migr,
residual_2019 AS residual
FROM us_counties_pop_est_2019;
-- Listing 6-5: Adding two columns in us_counties_2010
SELECT * FROM us_counties_pop_est_2019;
SELECT geo_name,
state_us_abbreviation AS "st",
p0010003 AS "White Alone",
p0010004 AS "Black Alone",
p0010003 + p0010004 AS "Total White and Black"
FROM us_counties_2010;
-- Listing 6-5: Subtracting two columns in us_counties_pop_est_2019
SELECT county_name AS county,
state_name AS state,
births_2019 AS births,
deaths_2019 AS deaths,
births_2019 - deaths_2019 AS natural_increase
FROM us_counties_pop_est_2019;
-- Listing 6-6: Checking Census data totals
SELECT geo_name,
state_us_abbreviation AS "st",
p0010001 AS "Total",
p0010003 + p0010004 + p0010005 + p0010006 + p0010007
+ p0010008 + p0010009 AS "All Races",
(p0010003 + p0010004 + p0010005 + p0010006 + p0010007
+ p0010008 + p0010009) - p0010001 AS "Difference"
FROM us_counties_2010
ORDER BY "Difference" DESC;
SELECT county_name AS county,
state_name AS state,
pop_est_2019 AS pop,
pop_est_2018 + births_2019 - deaths_2019 +
international_migr_2019 + domestic_migr_2019 +
residual_2019 AS components_total,
pop_est_2019 - (pop_est_2018 + births_2019 - deaths_2019 +
international_migr_2019 + domestic_migr_2019 +
residual_2019) AS difference
FROM us_counties_pop_est_2019
ORDER BY difference DESC;
-- Listing 6-7: Calculating the percent of the population that is
-- Asian by county (percent of the whole)
-- Listing 6-7: Calculating the percent of a county's area that is water
SELECT county_name AS county,
state_name AS state,
area_water::numeric / (area_land + area_water) * 100 AS pct_water
FROM us_counties_pop_est_2019
ORDER BY pct_water DESC;
SELECT geo_name,
state_us_abbreviation AS "st",
(CAST(p0010006 AS numeric(8,1)) / p0010001) * 100 AS "pct_asian"
FROM us_counties_2010
ORDER BY "pct_asian" DESC;
-- Listing 6-8: Calculating percent change
CREATE TABLE percent_change (
department varchar(20),
spend_2014 numeric(10,2),
spend_2017 numeric(10,2)
department text,
spend_2019 numeric(10,2),
spend_2022 numeric(10,2)
);
INSERT INTO percent_change
VALUES
('Building', 250000, 289000),
('Assessor', 178556, 179500),
('Library', 87777, 90001),
('Building', 250000, 289000),
('Clerk', 451980, 650000),
('Police', 250000, 223000),
('Recreation', 199000, 195000);
('Library', 87777, 90001),
('Parks', 250000, 223000),
('Water', 199000, 195000);
SELECT department,
spend_2014,
spend_2017,
round( (spend_2017 - spend_2014) /
spend_2014 * 100, 1 ) AS "pct_change"
spend_2019,
spend_2022,
round( (spend_2022 - spend_2019) /
spend_2019 * 100, 1 ) AS pct_change
FROM percent_change;
-- Listing 6-9: Using sum() and avg() aggregate functions
SELECT sum(p0010001) AS "County Sum",
round(avg(p0010001), 0) AS "County Average"
FROM us_counties_2010;
SELECT sum(pop_est_2019) AS county_sum,
round(avg(pop_est_2019), 0) AS county_average
FROM us_counties_pop_est_2019;
-- Listing 6-10: Testing SQL percentile functions
@ -126,37 +128,37 @@ FROM percentile_test;
-- Listing 6-11: Using sum(), avg(), and percentile_cont() aggregate functions
SELECT sum(p0010001) AS "County Sum",
round(avg(p0010001), 0) AS "County Average",
SELECT sum(pop_est_2019) AS county_sum,
round(avg(pop_est_2019), 0) AS county_average,
percentile_cont(.5)
WITHIN GROUP (ORDER BY p0010001) AS "County Median"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019) AS county_median
FROM us_counties_pop_est_2019;
select * from us_counties_pop_est_2019 order by pop_est_2019 asc;
-- Listing 6-12: Passing an array of values to percentile_cont()
-- quartiles
SELECT percentile_cont(array[.25,.5,.75])
WITHIN GROUP (ORDER BY p0010001) AS "quartiles"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019) AS quartiles
FROM us_counties_pop_est_2019;
-- Extra:
-- quintiles
SELECT percentile_cont(array[.2,.4,.6,.8])
WITHIN GROUP (ORDER BY p0010001) AS "quintiles"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019) AS quintiles
FROM us_counties_pop_est_2019;
-- deciles
SELECT percentile_cont(array[.1,.2,.3,.4,.5,.6,.7,.8,.9])
WITHIN GROUP (ORDER BY p0010001) AS "deciles"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019) AS deciles
FROM us_counties_pop_est_2019;
-- Listing 6-13: Using unnest() to turn an array into rows
SELECT unnest(
percentile_cont(array[.25,.5,.75])
WITHIN GROUP (ORDER BY p0010001)
) AS "quartiles"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019)
) AS quartiles
FROM us_counties_pop_est_2019;
-- Listing 6-14: Creating a median() aggregate function in PostgreSQL
-- Source: https://wiki.postgresql.org/wiki/Aggregate_Median
@ -194,14 +196,14 @@ CREATE AGGREGATE median(anyelement) (
-- Listing 6-15: Using a median() aggregate function
SELECT sum(p0010001) AS "County Sum",
round(avg(p0010001), 0) AS "County Average",
median(p0010001) AS "County Median",
SELECT sum(pop_est_2019) AS county_sum,
round(avg(pop_est_2019), 0) AS county_average,
median(pop_est_2019) AS county_median_func,
percentile_cont(.5)
WITHIN GROUP (ORDER BY P0010001) AS "50th Percentile"
FROM us_counties_2010;
WITHIN GROUP (ORDER BY pop_est_2019) AS county_median_perc
FROM us_counties_pop_est_2019;
-- Listing 6-16: Finding the most-frequent value with mode()
SELECT mode() WITHIN GROUP (ORDER BY p0010001)
FROM us_counties_2010;
SELECT mode() WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_pop_est_2019;

View File

@ -5,9 +5,11 @@
-- Try It Yourself Questions and Answers
----------------------------------------------------------------------------
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 2: Creating Your First Database and Table
--------------------------------------------------------------
----------------------------------------------------------------------------
-- 1. Imagine you're building a database to catalog all the animals at your
-- local zoo. You want one table for tracking all the kinds of animals and
@ -20,27 +22,30 @@
-- The first table will hold the animal types and their conservation status:
CREATE TABLE animal_types (
animal_type_id bigserial CONSTRAINT animal_types_key PRIMARY KEY,
common_name varchar(100) NOT NULL,
scientific_name varchar(100) NOT NULL,
conservation_status varchar(50) NOT NULL
animal_type_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
common_name text NOT NULL,
scientific_name text NOT NULL,
conservation_status text NOT NULL,
CONSTRAINT common_name_unique UNIQUE (common_name)
);
-- Note that I have added keywords on some columns that define constraints
-- such as a PRIMARY KEY. You will learn about these in Chapters 6 and 7.
-- It's OK if your answer doesn't have all the keywords in the example above. Those
-- keywords reference concepts you'll learn in later chapters, including table
-- constraints, primary keys and and IDENTITY columns. What's important is that you
-- considered the individual data items you would want to track.
-- The second table will hold data on individual animals. Note that the
-- column animal_type_id references the column of the same name in the
-- table animal types. This is a foreign key, which you will learn about in
-- Chapter 7.
-- Chapter 8.
CREATE TABLE menagerie (
menagerie_id bigserial CONSTRAINT menagerie_key PRIMARY KEY,
animal_type_id bigint REFERENCES animal_types (animal_type_id),
menagerie_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
common_name text REFERENCES animal_types (common_name),
date_acquired date NOT NULL,
gender varchar(1),
acquired_from varchar(100),
name varchar(100),
gender text,
acquired_from text,
name text,
notes text
);
@ -54,15 +59,16 @@ VALUES ('Bengal Tiger', 'Panthera tigris tigris', 'Endangered'),
('Arctic Wolf', 'Canis lupus arctos', 'Least Concern');
-- data source: https://www.worldwildlife.org/species/directory?direction=desc&sort=extinction_status
INSERT INTO menagerie (animal_type_id, date_acquired, gender, acquired_from, name, notes)
INSERT INTO menagerie (common_name, date_acquired, gender, acquired_from, name, notes)
VALUES
(1, '3/12/1996', 'F', 'Dhaka Zoo', 'Ariel', 'Healthy coat at last exam.'),
(2, '9/30/2000', 'F', 'National Zoo', 'Freddy', 'Strong appetite.');
('Bengal Tiger', '3/12/1996', 'F', 'Dhaka Zoo', 'Ariel', 'Healthy coat at last exam.'),
('Arctic Wolf', '9/30/2000', 'F', 'National Zoo', 'Freddy', 'Strong appetite.');
-- To view data via pgAdmin, in the object browser, right-click Tables and
-- select Refresh. Then right-click the table name and select
-- View/Edit Data > All Rows
-- 2b. Create an additional INSERT statement for one of your tables. On purpose,
-- leave out one of the required commas separating the entries in the VALUES
-- clause of the query. What is the error message? Does it help you find the
@ -73,9 +79,10 @@ VALUES
INSERT INTO animal_types (common_name, scientific_name, conservation_status)
VALUES ('Javan Rhino', 'Rhinoceros sondaicus' 'Critically Endangered');
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 3: Beginning Data Exploration with SELECT
--------------------------------------------------------------
----------------------------------------------------------------------------
-- 1. The school district superintendent asks for a list of teachers in each
-- school. Write a query that lists the schools in alphabetical order along
@ -107,9 +114,9 @@ WHERE hire_date >= '2010-01-01'
ORDER BY salary DESC;
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 4: Understanding Data Types
--------------------------------------------------------------
----------------------------------------------------------------------------
-- 1. Your company delivers fruit and vegetables to local grocery stores, and
-- you need to track the mileage driven by each driver each day to a tenth
@ -136,11 +143,16 @@ numeric(4,1)
-- Answer:
varchar(50)
-- or
text
-- 50 characters is a reasonable length for names, and varchar() ensures you
-- will not waste space when names are shorter. Separating first and last names
-- 50 characters is a reasonable length for names, and using either varchar(50)
-- or text ensures you will not waste space when names are shorter. Using text will
-- ensure that if you run into the exceptionally rare circumstance of a name longer
-- than 50 characters, you'll be covered. Also, separating first and last names
-- into their own columns will let you later sort on each independently.
-- 3. Assume you have a text column that includes strings formatted as dates.
-- One of the strings is written as '4//2017'. What will happen when you try
-- to convert that string to the timestamp data type?
@ -152,15 +164,15 @@ varchar(50)
SELECT CAST('4//2021' AS timestamp with time zone);
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 5: Importing and Exporting Data
--------------------------------------------------------------
----------------------------------------------------------------------------
-- 1. Write a WITH statement to include with COPY to handle the import of an
-- imaginary text file that has a first couple of rows that look like this:
-- id:movie:actor
-- 50:#Mission: Impossible#:Tom Cruise
id:movie:actor
50:#Mission: Impossible#:Tom Cruise
-- Answer: The WITH statement will need the options seen here:
@ -179,13 +191,14 @@ CREATE TABLE actors (
);
-- Note: You may never encounter a file that uses a colon as a delimiter and
-- and pound sign for quoting, but anything is possible.
-- pound sign for quoting, but anything is possible!
-- 2. Using the table us_counties_pop_est_2019 you created and filled in this chapter,
-- export to a CSV file the 20 counties in the United States that have the most
-- housing units. Make sure you export only each county's name, state, and
-- number of housing units. (Hint: Housing units are totaled for each county in
-- the column housing_unit_count_100_percent.
-- 2. Using the table us_counties_pop_est_2019 you created and filled in this
-- chapter, export to a CSV file the 20 counties in the United States that had
-- the most births. Make sure you export only each countys name, state, and
-- number of births. (Hint: births are totaled for each county in the column
-- births_2019.)
-- Answer:
@ -193,12 +206,13 @@ COPY (
SELECT county_name, state_name, births_2019
FROM us_counties_pop_est_2019 ORDER BY births_2019 DESC LIMIT 20
)
TO 'C:\YourDirectory\us_counties_housing_export.txt'
TO 'C:\YourDirectory\us_counties_births_export.txt'
WITH (FORMAT CSV, HEADER);
-- Note: This COPY statement uses a SELECT statement to limit the output to
-- only the desired columns and rows.
-- 3. Imagine you're importing a file that contains a column with these values:
-- 17519.668
-- 20084.461
@ -212,9 +226,9 @@ WITH (FORMAT CSV, HEADER);
-- type for the example data is numeric(8,3).
--------------------------------------------------------------
-- Chapter 5: Basic Math and Stats with SQL
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 6: Basic Math and Stats with SQL
----------------------------------------------------------------------------
-- 1. Write a SQL statement for calculating the area of a circle whose radius is
-- 5 inches. Do you need parentheses in your calculation? Why or why not?
@ -231,64 +245,68 @@ SELECT 3.14 * 5 ^ 2;
SELECT 3.14 * (5 ^ 2);
-- 2. Using the 2010 Census county data, find out which New York state county
-- has the highest percentage of the population that identified as "American
-- Indian/Alaska Native Alone." What can you learn about that county from online
-- research that explains the relatively large proportion of American Indian
-- population compared with other New York counties?
-- 2. Using the 2019 Census county estimates data, calculate a ratio of births to
-- deaths for each county in New York state. Which region of the state generally
-- saw a higher ratio of births to deaths in 2019?
-- Answer:
-- Franklin County, N.Y., with 7.4%. The county contains the St. Regis Mohawk
-- Reservation. https://en.wikipedia.org/wiki/St._Regis_Mohawk_Reservation
SELECT geo_name,
state_us_abbreviation,
p0010001 AS total_population,
p0010005 AS american_indian_alaska_native_alone,
(CAST (p0010005 AS numeric(8,1)) / p0010001) * 100
AS percent_american_indian_alaska_native_alone
FROM us_counties_2010
WHERE state_us_abbreviation = 'NY'
ORDER BY percent_american_indian_alaska_native_alone DESC;
SELECT county_name,
state_name,
births_2019 AS births,
deaths_2019 AS DEATHS,
births_2019::numeric / deaths_2019 AS birth_death_ratio
FROM us_counties_pop_est_2019
WHERE state_name = 'New York'
ORDER BY birth_death_ratio DESC;
-- 3. Was the 2010 median county population higher in California or New York?
-- Generally, counties in and around New York City had the highest ratio of births
-- to deaths in the 2019 estimates. One exception to the trend is Jefferson County,
-- which is upstate on the U.S./Canadian border.
-- 3. Was the 2019 median county population estimate higher in California or New York?
-- Answer:
-- California had a median county population of 179,140.5 in 2010, almost double
-- that of New York, at 91,301. Here are two solutions:
-- California had a median county population estimate of 187,029 in 2019, almost double
-- that of New York, at 86,687. Here are two solutions:
-- First, you can find the median for each state one at a time:
SELECT percentile_cont(.5)
WITHIN GROUP (ORDER BY p0010001)
FROM us_counties_2010
WHERE state_us_abbreviation = 'NY';
WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_pop_est_2019
WHERE state_name = 'New York';
SELECT percentile_cont(.5)
WITHIN GROUP (ORDER BY p0010001)
FROM us_counties_2010
WHERE state_us_abbreviation = 'CA';
WITHIN GROUP (ORDER BY pop_est_2019)
FROM us_counties_pop_est_2019
WHERE state_name = 'California';
-- Or both in one query (credit: https://github.com/Kennith-eng)
SELECT state_us_abbreviation,
SELECT state_name,
percentile_cont(0.5)
WITHIN GROUP (ORDER BY p0010001) AS median
FROM us_counties_2010
WHERE state_us_abbreviation IN ('NY', 'CA')
GROUP BY state_us_abbreviation;
WITHIN GROUP (ORDER BY pop_est_2019) AS median
FROM us_counties_pop_est_2019
WHERE state_name IN ('New York', 'California')
GROUP BY state_name;
-- Finally, this query shows the median for each state:
SELECT state_us_abbreviation,
SELECT state_name,
percentile_cont(0.5)
WITHIN GROUP (ORDER BY p0010001) AS median
FROM us_counties_2010
GROUP BY state_us_abbreviation;
WITHIN GROUP (ORDER BY pop_est_2019) AS median
FROM us_counties_pop_est_2019
GROUP BY state_name;
--------------------------------------------------------------
-- Chapter 6: Joining Tables in a Relational Database
-- Chapter 7: Joining Tables in a Relational Database
--------------------------------------------------------------
-- 1. The table us_counties_2010 contains 3,143 rows, and us_counties_2000 has
@ -368,7 +386,7 @@ ON c2010.state_fips = c2000.state_fips
ORDER BY pct_change ASC;
--------------------------------------------------------------
-- Chapter 7: Table Design that Works for You
-- Chapter 8: Table Design that Works for You
--------------------------------------------------------------
-- Consider the following two tables from a database youre making to keep
@ -461,7 +479,7 @@ CREATE TABLE songs (
----------------------------------------------------------------
-- Chapter 8: Extracting Information by Grouping and Summarizing
-- Chapter 9: Extracting Information by Grouping and Summarizing
----------------------------------------------------------------
-- 1. We saw that library visits have declined in most places. But what is the
@ -582,7 +600,7 @@ WHERE pls14.fscskey IS NULL OR pls09.fscskey IS NULL;
-- that do not appear in both tables.
--------------------------------------------------------------
-- Chapter 9: Inspecting and Modifying Data
-- Chapter 10: Inspecting and Modifying Data
--------------------------------------------------------------
-- In this exercise, youll turn the meat_poultry_egg_inspect table into useful
@ -636,7 +654,7 @@ WHERE meat_processing = TRUE AND
poultry_processing = TRUE;
--------------------------------------------------------------
-- Chapter 10: Statistical Functions in SQL
-- Chapter 11: Statistical Functions in SQL
--------------------------------------------------------------
-- 1. In Listing 10-2, the correlation coefficient, or r value, of the
@ -720,7 +738,7 @@ WHERE popu_lsa >= 250000;
--------------------------------------------------------------
-- Chapter 11: Working with Dates and Times
-- Chapter 12: Working with Dates and Times
--------------------------------------------------------------
-- 1. Using the New York City taxi data, calculate the length of each ride using
@ -787,7 +805,7 @@ WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
-- expect this given that cost of a taxi ride is based on both time and distance.
--------------------------------------------------------------
-- Chapter 12: Advanced Query Techniques
-- Chapter 13: Advanced Query Techniques
--------------------------------------------------------------
-- 1. Revise the code in Listing 12-15 to dig deeper into the nuances of
@ -858,7 +876,7 @@ AS (flavor varchar(20),
-------------------------------------------------------------
-- Chapter 13: Mining Text to Find Meaningful Data
-- Chapter 14: Mining Text to Find Meaningful Data
--------------------------------------------------------------
-- 1. The style guide of a publishing company you're writing for wants you to
@ -934,7 +952,7 @@ LIMIT 5;
--------------------------------------------------------------
-- Chapter 14: Analyzing Spatial Data with PostGIS
-- Chapter 15: Analyzing Spatial Data with PostGIS
--------------------------------------------------------------
-- 1. Earlier, you found which US county has the largest area. Now,
@ -1004,7 +1022,14 @@ ORDER BY census.statefp10, census.name10;
-- Can you spot it?
--------------------------------------------------------------
-- Chapter 15: Saving Time with Views, Functions, and Triggers
-- Chapter 16: Working with JSON Data
--------------------------------------------------------------
-- To come ...
--------------------------------------------------------------
-- Chapter 17: Saving Time with Views, Functions, and Triggers
--------------------------------------------------------------
-- 1. Create a view that displays the number of New York City taxi trips per
@ -1085,14 +1110,14 @@ SELECT * FROM meat_poultry_egg_inspect
WHERE company = 'testcompany';
--------------------------------------------------------------
-- Chapter 16: Using PostgreSQL From the Command Line
-- Chapter 18: Using PostgreSQL From the Command Line
--------------------------------------------------------------
-- For this chapter, use psql to review any of the exercises in the book.
--------------------------------------------------------------
-- Chapter 17: Maintaining Your Database
-- Chapter 19: Maintaining Your Database
--------------------------------------------------------------
-- To back up the gis_analysis database, use the pg_dump utility at the command line:
@ -1100,7 +1125,7 @@ WHERE company = 'testcompany';
-----------------------------------------------------------------
-- Chapter 18: Identifying and Telling the Story Behind Your Data
-- Chapter 20: Identifying and Telling the Story Behind Your Data
-----------------------------------------------------------------
-- This is a non-coding chapter.