diff --git a/Chapter_06/Chapter_06.sql b/Chapter_06/Chapter_06.sql index 6e0a973..ac542d2 100644 --- a/Chapter_06/Chapter_06.sql +++ b/Chapter_06/Chapter_06.sql @@ -1,9 +1,9 @@ --------------------------------------------------------------- --- Practical SQL: A Beginner's Guide to Storytelling with Data +--------------------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition -- by Anthony DeBarros -- Chapter 6 Code Examples --------------------------------------------------------------- +---------------------------------------------------------------------------- -- Listing 6-1: Basic addition, subtraction and multiplication with SQL @@ -34,79 +34,81 @@ SELECT (7 + 8) * 9; -- answer: 135 SELECT 3 ^ 3 - 1; -- answer: 26 SELECT 3 ^ (3 - 1); -- answer: 9 --- Listing 6-4: Selecting Census population columns by race with aliases +-- Listing 6-4: Selecting Census population estimate columns with aliases -SELECT geo_name, - state_us_abbreviation AS "st", - p0010001 AS "Total Population", - p0010003 AS "White Alone", - p0010004 AS "Black or African American Alone", - p0010005 AS "Am Indian/Alaska Native Alone", - p0010006 AS "Asian Alone", - p0010007 AS "Native Hawaiian and Other Pacific Islander Alone", - p0010008 AS "Some Other Race Alone", - p0010009 AS "Two or More Races" -FROM us_counties_2010; +SELECT county_name AS county, + state_name AS state, + pop_est_2019 AS pop, + births_2019 AS births, + deaths_2019 AS deaths, + international_migr_2019 AS int_migr, + domestic_migr_2019 AS dom_migr, + residual_2019 AS residual +FROM us_counties_pop_est_2019; --- Listing 6-5: Adding two columns in us_counties_2010 +SELECT * FROM us_counties_pop_est_2019; -SELECT geo_name, - state_us_abbreviation AS "st", - p0010003 AS "White Alone", - p0010004 AS "Black Alone", - p0010003 + p0010004 AS "Total White and Black" -FROM us_counties_2010; +-- Listing 6-5: Subtracting two columns in us_counties_pop_est_2019 + +SELECT county_name AS county, + state_name AS state, + births_2019 AS births, + deaths_2019 AS deaths, + births_2019 - deaths_2019 AS natural_increase +FROM us_counties_pop_est_2019; -- Listing 6-6: Checking Census data totals -SELECT geo_name, - state_us_abbreviation AS "st", - p0010001 AS "Total", - p0010003 + p0010004 + p0010005 + p0010006 + p0010007 - + p0010008 + p0010009 AS "All Races", - (p0010003 + p0010004 + p0010005 + p0010006 + p0010007 - + p0010008 + p0010009) - p0010001 AS "Difference" -FROM us_counties_2010 -ORDER BY "Difference" DESC; +SELECT county_name AS county, + state_name AS state, + pop_est_2019 AS pop, + pop_est_2018 + births_2019 - deaths_2019 + + international_migr_2019 + domestic_migr_2019 + + residual_2019 AS components_total, + pop_est_2019 - (pop_est_2018 + births_2019 - deaths_2019 + + international_migr_2019 + domestic_migr_2019 + + residual_2019) AS difference +FROM us_counties_pop_est_2019 +ORDER BY difference DESC; --- Listing 6-7: Calculating the percent of the population that is --- Asian by county (percent of the whole) +-- Listing 6-7: Calculating the percent of a county's area that is water + +SELECT county_name AS county, + state_name AS state, + area_water::numeric / (area_land + area_water) * 100 AS pct_water +FROM us_counties_pop_est_2019 +ORDER BY pct_water DESC; -SELECT geo_name, - state_us_abbreviation AS "st", - (CAST(p0010006 AS numeric(8,1)) / p0010001) * 100 AS "pct_asian" -FROM us_counties_2010 -ORDER BY "pct_asian" DESC; -- Listing 6-8: Calculating percent change CREATE TABLE percent_change ( - department varchar(20), - spend_2014 numeric(10,2), - spend_2017 numeric(10,2) + department text, + spend_2019 numeric(10,2), + spend_2022 numeric(10,2) ); INSERT INTO percent_change VALUES - ('Building', 250000, 289000), ('Assessor', 178556, 179500), - ('Library', 87777, 90001), + ('Building', 250000, 289000), ('Clerk', 451980, 650000), - ('Police', 250000, 223000), - ('Recreation', 199000, 195000); + ('Library', 87777, 90001), + ('Parks', 250000, 223000), + ('Water', 199000, 195000); SELECT department, - spend_2014, - spend_2017, - round( (spend_2017 - spend_2014) / - spend_2014 * 100, 1 ) AS "pct_change" + spend_2019, + spend_2022, + round( (spend_2022 - spend_2019) / + spend_2019 * 100, 1 ) AS pct_change FROM percent_change; -- Listing 6-9: Using sum() and avg() aggregate functions -SELECT sum(p0010001) AS "County Sum", - round(avg(p0010001), 0) AS "County Average" -FROM us_counties_2010; +SELECT sum(pop_est_2019) AS county_sum, + round(avg(pop_est_2019), 0) AS county_average +FROM us_counties_pop_est_2019; -- Listing 6-10: Testing SQL percentile functions @@ -126,37 +128,37 @@ FROM percentile_test; -- Listing 6-11: Using sum(), avg(), and percentile_cont() aggregate functions -SELECT sum(p0010001) AS "County Sum", - round(avg(p0010001), 0) AS "County Average", +SELECT sum(pop_est_2019) AS county_sum, + round(avg(pop_est_2019), 0) AS county_average, percentile_cont(.5) - WITHIN GROUP (ORDER BY p0010001) AS "County Median" -FROM us_counties_2010; - + WITHIN GROUP (ORDER BY pop_est_2019) AS county_median +FROM us_counties_pop_est_2019; +select * from us_counties_pop_est_2019 order by pop_est_2019 asc; -- Listing 6-12: Passing an array of values to percentile_cont() -- quartiles SELECT percentile_cont(array[.25,.5,.75]) - WITHIN GROUP (ORDER BY p0010001) AS "quartiles" -FROM us_counties_2010; + WITHIN GROUP (ORDER BY pop_est_2019) AS quartiles +FROM us_counties_pop_est_2019; -- Extra: -- quintiles SELECT percentile_cont(array[.2,.4,.6,.8]) - WITHIN GROUP (ORDER BY p0010001) AS "quintiles" -FROM us_counties_2010; + WITHIN GROUP (ORDER BY pop_est_2019) AS quintiles +FROM us_counties_pop_est_2019; -- deciles SELECT percentile_cont(array[.1,.2,.3,.4,.5,.6,.7,.8,.9]) - WITHIN GROUP (ORDER BY p0010001) AS "deciles" -FROM us_counties_2010; + WITHIN GROUP (ORDER BY pop_est_2019) AS deciles +FROM us_counties_pop_est_2019; -- Listing 6-13: Using unnest() to turn an array into rows SELECT unnest( percentile_cont(array[.25,.5,.75]) - WITHIN GROUP (ORDER BY p0010001) - ) AS "quartiles" -FROM us_counties_2010; + WITHIN GROUP (ORDER BY pop_est_2019) + ) AS quartiles +FROM us_counties_pop_est_2019; -- Listing 6-14: Creating a median() aggregate function in PostgreSQL -- Source: https://wiki.postgresql.org/wiki/Aggregate_Median @@ -194,14 +196,14 @@ CREATE AGGREGATE median(anyelement) ( -- Listing 6-15: Using a median() aggregate function -SELECT sum(p0010001) AS "County Sum", - round(avg(p0010001), 0) AS "County Average", - median(p0010001) AS "County Median", +SELECT sum(pop_est_2019) AS county_sum, + round(avg(pop_est_2019), 0) AS county_average, + median(pop_est_2019) AS county_median_func, percentile_cont(.5) - WITHIN GROUP (ORDER BY P0010001) AS "50th Percentile" -FROM us_counties_2010; + WITHIN GROUP (ORDER BY pop_est_2019) AS county_median_perc +FROM us_counties_pop_est_2019; -- Listing 6-16: Finding the most-frequent value with mode() -SELECT mode() WITHIN GROUP (ORDER BY p0010001) -FROM us_counties_2010; +SELECT mode() WITHIN GROUP (ORDER BY pop_est_2019) +FROM us_counties_pop_est_2019; diff --git a/Try_It_Yourself/Try_It_Yourself.sql b/Try_It_Yourself/Try_It_Yourself.sql index 7de8433..0e4df83 100644 --- a/Try_It_Yourself/Try_It_Yourself.sql +++ b/Try_It_Yourself/Try_It_Yourself.sql @@ -5,9 +5,11 @@ -- Try It Yourself Questions and Answers ---------------------------------------------------------------------------- --------------------------------------------------------------- + + +---------------------------------------------------------------------------- -- Chapter 2: Creating Your First Database and Table --------------------------------------------------------------- +---------------------------------------------------------------------------- -- 1. Imagine you're building a database to catalog all the animals at your -- local zoo. You want one table for tracking all the kinds of animals and @@ -20,27 +22,30 @@ -- The first table will hold the animal types and their conservation status: CREATE TABLE animal_types ( - animal_type_id bigserial CONSTRAINT animal_types_key PRIMARY KEY, - common_name varchar(100) NOT NULL, - scientific_name varchar(100) NOT NULL, - conservation_status varchar(50) NOT NULL + animal_type_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY, + common_name text NOT NULL, + scientific_name text NOT NULL, + conservation_status text NOT NULL, + CONSTRAINT common_name_unique UNIQUE (common_name) ); --- Note that I have added keywords on some columns that define constraints --- such as a PRIMARY KEY. You will learn about these in Chapters 6 and 7. +-- It's OK if your answer doesn't have all the keywords in the example above. Those +-- keywords reference concepts you'll learn in later chapters, including table +-- constraints, primary keys and and IDENTITY columns. What's important is that you +-- considered the individual data items you would want to track. -- The second table will hold data on individual animals. Note that the -- column animal_type_id references the column of the same name in the -- table animal types. This is a foreign key, which you will learn about in --- Chapter 7. +-- Chapter 8. CREATE TABLE menagerie ( - menagerie_id bigserial CONSTRAINT menagerie_key PRIMARY KEY, - animal_type_id bigint REFERENCES animal_types (animal_type_id), + menagerie_id bigint PRIMARY KEY GENERATED ALWAYS AS IDENTITY, + common_name text REFERENCES animal_types (common_name), date_acquired date NOT NULL, - gender varchar(1), - acquired_from varchar(100), - name varchar(100), + gender text, + acquired_from text, + name text, notes text ); @@ -54,15 +59,16 @@ VALUES ('Bengal Tiger', 'Panthera tigris tigris', 'Endangered'), ('Arctic Wolf', 'Canis lupus arctos', 'Least Concern'); -- data source: https://www.worldwildlife.org/species/directory?direction=desc&sort=extinction_status -INSERT INTO menagerie (animal_type_id, date_acquired, gender, acquired_from, name, notes) +INSERT INTO menagerie (common_name, date_acquired, gender, acquired_from, name, notes) VALUES -(1, '3/12/1996', 'F', 'Dhaka Zoo', 'Ariel', 'Healthy coat at last exam.'), -(2, '9/30/2000', 'F', 'National Zoo', 'Freddy', 'Strong appetite.'); +('Bengal Tiger', '3/12/1996', 'F', 'Dhaka Zoo', 'Ariel', 'Healthy coat at last exam.'), +('Arctic Wolf', '9/30/2000', 'F', 'National Zoo', 'Freddy', 'Strong appetite.'); -- To view data via pgAdmin, in the object browser, right-click Tables and -- select Refresh. Then right-click the table name and select -- View/Edit Data > All Rows + -- 2b. Create an additional INSERT statement for one of your tables. On purpose, -- leave out one of the required commas separating the entries in the VALUES -- clause of the query. What is the error message? Does it help you find the @@ -73,9 +79,10 @@ VALUES INSERT INTO animal_types (common_name, scientific_name, conservation_status) VALUES ('Javan Rhino', 'Rhinoceros sondaicus' 'Critically Endangered'); --------------------------------------------------------------- + +---------------------------------------------------------------------------- -- Chapter 3: Beginning Data Exploration with SELECT --------------------------------------------------------------- +---------------------------------------------------------------------------- -- 1. The school district superintendent asks for a list of teachers in each -- school. Write a query that lists the schools in alphabetical order along @@ -107,9 +114,9 @@ WHERE hire_date >= '2010-01-01' ORDER BY salary DESC; --------------------------------------------------------------- +---------------------------------------------------------------------------- -- Chapter 4: Understanding Data Types --------------------------------------------------------------- +---------------------------------------------------------------------------- -- 1. Your company delivers fruit and vegetables to local grocery stores, and -- you need to track the mileage driven by each driver each day to a tenth @@ -136,11 +143,16 @@ numeric(4,1) -- Answer: varchar(50) +-- or +text --- 50 characters is a reasonable length for names, and varchar() ensures you --- will not waste space when names are shorter. Separating first and last names +-- 50 characters is a reasonable length for names, and using either varchar(50) +-- or text ensures you will not waste space when names are shorter. Using text will +-- ensure that if you run into the exceptionally rare circumstance of a name longer +-- than 50 characters, you'll be covered. Also, separating first and last names -- into their own columns will let you later sort on each independently. + -- 3. Assume you have a text column that includes strings formatted as dates. -- One of the strings is written as '4//2017'. What will happen when you try -- to convert that string to the timestamp data type? @@ -152,15 +164,15 @@ varchar(50) SELECT CAST('4//2021' AS timestamp with time zone); --------------------------------------------------------------- +---------------------------------------------------------------------------- -- Chapter 5: Importing and Exporting Data --------------------------------------------------------------- +---------------------------------------------------------------------------- -- 1. Write a WITH statement to include with COPY to handle the import of an -- imaginary text file that has a first couple of rows that look like this: --- id:movie:actor --- 50:#Mission: Impossible#:Tom Cruise +id:movie:actor +50:#Mission: Impossible#:Tom Cruise -- Answer: The WITH statement will need the options seen here: @@ -179,13 +191,14 @@ CREATE TABLE actors ( ); -- Note: You may never encounter a file that uses a colon as a delimiter and --- and pound sign for quoting, but anything is possible. +-- pound sign for quoting, but anything is possible! --- 2. Using the table us_counties_pop_est_2019 you created and filled in this chapter, --- export to a CSV file the 20 counties in the United States that have the most --- housing units. Make sure you export only each county's name, state, and --- number of housing units. (Hint: Housing units are totaled for each county in --- the column housing_unit_count_100_percent. + +-- 2. Using the table us_counties_pop_est_2019 you created and filled in this +-- chapter, export to a CSV file the 20 counties in the United States that had +-- the most births. Make sure you export only each county’s name, state, and +-- number of births. (Hint: births are totaled for each county in the column +-- births_2019.) -- Answer: @@ -193,12 +206,13 @@ COPY ( SELECT county_name, state_name, births_2019 FROM us_counties_pop_est_2019 ORDER BY births_2019 DESC LIMIT 20 ) -TO 'C:\YourDirectory\us_counties_housing_export.txt' +TO 'C:\YourDirectory\us_counties_births_export.txt' WITH (FORMAT CSV, HEADER); -- Note: This COPY statement uses a SELECT statement to limit the output to -- only the desired columns and rows. + -- 3. Imagine you're importing a file that contains a column with these values: -- 17519.668 -- 20084.461 @@ -212,9 +226,9 @@ WITH (FORMAT CSV, HEADER); -- type for the example data is numeric(8,3). --------------------------------------------------------------- --- Chapter 5: Basic Math and Stats with SQL --------------------------------------------------------------- +---------------------------------------------------------------------------- +-- Chapter 6: Basic Math and Stats with SQL +---------------------------------------------------------------------------- -- 1. Write a SQL statement for calculating the area of a circle whose radius is -- 5 inches. Do you need parentheses in your calculation? Why or why not? @@ -231,64 +245,68 @@ SELECT 3.14 * 5 ^ 2; SELECT 3.14 * (5 ^ 2); --- 2. Using the 2010 Census county data, find out which New York state county --- has the highest percentage of the population that identified as "American --- Indian/Alaska Native Alone." What can you learn about that county from online --- research that explains the relatively large proportion of American Indian --- population compared with other New York counties? + +-- 2. Using the 2019 Census county estimates data, calculate a ratio of births to +-- deaths for each county in New York state. Which region of the state generally +-- saw a higher ratio of births to deaths in 2019? -- Answer: --- Franklin County, N.Y., with 7.4%. The county contains the St. Regis Mohawk --- Reservation. https://en.wikipedia.org/wiki/St._Regis_Mohawk_Reservation -SELECT geo_name, - state_us_abbreviation, - p0010001 AS total_population, - p0010005 AS american_indian_alaska_native_alone, - (CAST (p0010005 AS numeric(8,1)) / p0010001) * 100 - AS percent_american_indian_alaska_native_alone -FROM us_counties_2010 -WHERE state_us_abbreviation = 'NY' -ORDER BY percent_american_indian_alaska_native_alone DESC; +SELECT county_name, + state_name, + births_2019 AS births, + deaths_2019 AS DEATHS, + births_2019::numeric / deaths_2019 AS birth_death_ratio +FROM us_counties_pop_est_2019 +WHERE state_name = 'New York' +ORDER BY birth_death_ratio DESC; --- 3. Was the 2010 median county population higher in California or New York? +-- Generally, counties in and around New York City had the highest ratio of births +-- to deaths in the 2019 estimates. One exception to the trend is Jefferson County, +-- which is upstate on the U.S./Canadian border. + + +-- 3. Was the 2019 median county population estimate higher in California or New York? -- Answer: --- California had a median county population of 179,140.5 in 2010, almost double --- that of New York, at 91,301. Here are two solutions: +-- California had a median county population estimate of 187,029 in 2019, almost double +-- that of New York, at 86,687. Here are two solutions: -- First, you can find the median for each state one at a time: SELECT percentile_cont(.5) - WITHIN GROUP (ORDER BY p0010001) -FROM us_counties_2010 -WHERE state_us_abbreviation = 'NY'; + WITHIN GROUP (ORDER BY pop_est_2019) +FROM us_counties_pop_est_2019 +WHERE state_name = 'New York'; SELECT percentile_cont(.5) - WITHIN GROUP (ORDER BY p0010001) -FROM us_counties_2010 -WHERE state_us_abbreviation = 'CA'; + WITHIN GROUP (ORDER BY pop_est_2019) +FROM us_counties_pop_est_2019 +WHERE state_name = 'California'; -- Or both in one query (credit: https://github.com/Kennith-eng) -SELECT state_us_abbreviation, +SELECT state_name, percentile_cont(0.5) - WITHIN GROUP (ORDER BY p0010001) AS median -FROM us_counties_2010 -WHERE state_us_abbreviation IN ('NY', 'CA') -GROUP BY state_us_abbreviation; - + WITHIN GROUP (ORDER BY pop_est_2019) AS median +FROM us_counties_pop_est_2019 +WHERE state_name IN ('New York', 'California') +GROUP BY state_name; + -- Finally, this query shows the median for each state: -SELECT state_us_abbreviation, +SELECT state_name, percentile_cont(0.5) - WITHIN GROUP (ORDER BY p0010001) AS median -FROM us_counties_2010 -GROUP BY state_us_abbreviation; + WITHIN GROUP (ORDER BY pop_est_2019) AS median +FROM us_counties_pop_est_2019 +GROUP BY state_name; + + + -------------------------------------------------------------- --- Chapter 6: Joining Tables in a Relational Database +-- Chapter 7: Joining Tables in a Relational Database -------------------------------------------------------------- -- 1. The table us_counties_2010 contains 3,143 rows, and us_counties_2000 has @@ -368,7 +386,7 @@ ON c2010.state_fips = c2000.state_fips ORDER BY pct_change ASC; -------------------------------------------------------------- --- Chapter 7: Table Design that Works for You +-- Chapter 8: Table Design that Works for You -------------------------------------------------------------- -- Consider the following two tables from a database you’re making to keep @@ -461,7 +479,7 @@ CREATE TABLE songs ( ---------------------------------------------------------------- --- Chapter 8: Extracting Information by Grouping and Summarizing +-- Chapter 9: Extracting Information by Grouping and Summarizing ---------------------------------------------------------------- -- 1. We saw that library visits have declined in most places. But what is the @@ -582,7 +600,7 @@ WHERE pls14.fscskey IS NULL OR pls09.fscskey IS NULL; -- that do not appear in both tables. -------------------------------------------------------------- --- Chapter 9: Inspecting and Modifying Data +-- Chapter 10: Inspecting and Modifying Data -------------------------------------------------------------- -- In this exercise, you’ll turn the meat_poultry_egg_inspect table into useful @@ -636,7 +654,7 @@ WHERE meat_processing = TRUE AND poultry_processing = TRUE; -------------------------------------------------------------- --- Chapter 10: Statistical Functions in SQL +-- Chapter 11: Statistical Functions in SQL -------------------------------------------------------------- -- 1. In Listing 10-2, the correlation coefficient, or r value, of the @@ -720,7 +738,7 @@ WHERE popu_lsa >= 250000; -------------------------------------------------------------- --- Chapter 11: Working with Dates and Times +-- Chapter 12: Working with Dates and Times -------------------------------------------------------------- -- 1. Using the New York City taxi data, calculate the length of each ride using @@ -787,7 +805,7 @@ WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval; -- expect this given that cost of a taxi ride is based on both time and distance. -------------------------------------------------------------- --- Chapter 12: Advanced Query Techniques +-- Chapter 13: Advanced Query Techniques -------------------------------------------------------------- -- 1. Revise the code in Listing 12-15 to dig deeper into the nuances of @@ -858,7 +876,7 @@ AS (flavor varchar(20), ------------------------------------------------------------- --- Chapter 13: Mining Text to Find Meaningful Data +-- Chapter 14: Mining Text to Find Meaningful Data -------------------------------------------------------------- -- 1. The style guide of a publishing company you're writing for wants you to @@ -934,7 +952,7 @@ LIMIT 5; -------------------------------------------------------------- --- Chapter 14: Analyzing Spatial Data with PostGIS +-- Chapter 15: Analyzing Spatial Data with PostGIS -------------------------------------------------------------- -- 1. Earlier, you found which US county has the largest area. Now, @@ -1004,7 +1022,14 @@ ORDER BY census.statefp10, census.name10; -- Can you spot it? -------------------------------------------------------------- --- Chapter 15: Saving Time with Views, Functions, and Triggers +-- Chapter 16: Working with JSON Data +-------------------------------------------------------------- + +-- To come ... + + +-------------------------------------------------------------- +-- Chapter 17: Saving Time with Views, Functions, and Triggers -------------------------------------------------------------- -- 1. Create a view that displays the number of New York City taxi trips per @@ -1085,14 +1110,14 @@ SELECT * FROM meat_poultry_egg_inspect WHERE company = 'testcompany'; -------------------------------------------------------------- --- Chapter 16: Using PostgreSQL From the Command Line +-- Chapter 18: Using PostgreSQL From the Command Line -------------------------------------------------------------- -- For this chapter, use psql to review any of the exercises in the book. -------------------------------------------------------------- --- Chapter 17: Maintaining Your Database +-- Chapter 19: Maintaining Your Database -------------------------------------------------------------- -- To back up the gis_analysis database, use the pg_dump utility at the command line: @@ -1100,7 +1125,7 @@ WHERE company = 'testcompany'; ----------------------------------------------------------------- --- Chapter 18: Identifying and Telling the Story Behind Your Data +-- Chapter 20: Identifying and Telling the Story Behind Your Data ----------------------------------------------------------------- -- This is a non-coding chapter.