From 0d2ffa4aadb9a51c50ce137b980fcdbd4052d617 Mon Sep 17 00:00:00 2001 From: Anthony DeBarros Date: Wed, 26 Feb 2020 21:43:21 -0500 Subject: [PATCH] More renumbering --- Chapter_02/Chapter_02.sql | 34 ++++ Chapter_03/Chapter_03.sql | 104 +++++++++++ Chapter_04/Chapter_04.sql | 90 ++++++++++ Chapter_05/Chapter_05.sql | 212 ++++++++++++++++++++++ Chapter_06/Chapter_06.sql | 207 ++++++++++++++++++++++ Chapter_07/Chapter_07.sql | 196 ++++++++++++++++++++ Chapter_08/Chapter_08.sql | 184 +++++++++++++++++++ Chapter_09/Chapter_09.sql | 299 +++++++++++++++++++++++++++++++ Chapter_10/Chapter_10.sql | 259 +++++++++++++++++++++++++++ Chapter_11/Chapter_11.sql | 167 ++++++++++++++++++ Chapter_12/Chapter_12.sql | 233 ++++++++++++++++++++++++ Chapter_13/Chapter_13.sql | 310 ++++++++++++++++++++++++++++++++ Chapter_14/Chapter_14.sql | 363 ++++++++++++++++++++++++++++++++++++++ Chapter_15/Chapter_15.sql | 252 ++++++++++++++++++++++++++ Chapter_17/Chapter_17.sql | 321 +++++++++++++++++++++++++++++++++ Chapter_19/Chapter_19.sql | 104 +++++++++++ 16 files changed, 3335 insertions(+) create mode 100644 Chapter_02/Chapter_02.sql create mode 100644 Chapter_03/Chapter_03.sql create mode 100644 Chapter_04/Chapter_04.sql create mode 100644 Chapter_05/Chapter_05.sql create mode 100644 Chapter_06/Chapter_06.sql create mode 100644 Chapter_07/Chapter_07.sql create mode 100644 Chapter_08/Chapter_08.sql create mode 100644 Chapter_09/Chapter_09.sql create mode 100644 Chapter_10/Chapter_10.sql create mode 100644 Chapter_11/Chapter_11.sql create mode 100644 Chapter_12/Chapter_12.sql create mode 100644 Chapter_13/Chapter_13.sql create mode 100644 Chapter_14/Chapter_14.sql create mode 100644 Chapter_15/Chapter_15.sql create mode 100644 Chapter_17/Chapter_17.sql create mode 100644 Chapter_19/Chapter_19.sql diff --git a/Chapter_02/Chapter_02.sql b/Chapter_02/Chapter_02.sql new file mode 100644 index 0000000..d5494f3 --- /dev/null +++ b/Chapter_02/Chapter_02.sql @@ -0,0 +1,34 @@ 
+-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 1 Code Examples +-------------------------------------------------------------- + +-- Listing 1-1: Creating a database named analysis + +CREATE DATABASE analysis; + +-- Listing 1-2: Creating a table named teachers with six columns + +CREATE TABLE teachers ( + id bigserial, + first_name varchar(25), + last_name varchar(50), + school varchar(50), + hire_date date, + salary numeric +); + +-- This command will remove (drop) the table. +-- DROP TABLE teachers; + +-- Listing 1-3 Inserting data into the teachers table + +INSERT INTO teachers (first_name, last_name, school, hire_date, salary) +VALUES ('Janet', 'Smith', 'F.D. Roosevelt HS', '2011-10-30', 36200), + ('Lee', 'Reynolds', 'F.D. Roosevelt HS', '1993-05-22', 65000), + ('Samuel', 'Cole', 'Myers Middle School', '2005-08-01', 43500), + ('Samantha', 'Bush', 'Myers Middle School', '2011-10-30', 36200), + ('Betty', 'Diaz', 'Myers Middle School', '2005-08-30', 43500), + ('Kathleen', 'Roush', 'F.D. 
Roosevelt HS', '2010-10-22', 38500); diff --git a/Chapter_03/Chapter_03.sql b/Chapter_03/Chapter_03.sql new file mode 100644 index 0000000..ea76448 --- /dev/null +++ b/Chapter_03/Chapter_03.sql @@ -0,0 +1,104 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 2 Code Examples +-------------------------------------------------------------- + +-- Listing 2-1: Querying all rows and columns from the teachers table + +SELECT * FROM teachers; + +-- Listing 2-2: Querying a subset of columns + +SELECT last_name, first_name, salary FROM teachers; + +-- Listing 2-3: Querying distinct values in the school column + +SELECT DISTINCT school +FROM teachers; + +-- Listing 2-4: Querying distinct pairs of values in the school and salary +-- columns + +SELECT DISTINCT school, salary +FROM teachers; + +-- Listing 2-5: Sorting a column with ORDER BY + +SELECT first_name, last_name, salary +FROM teachers +ORDER BY salary DESC; + +-- Listing 2-6: Sorting multiple columns with ORDER BY + +SELECT last_name, school, hire_date +FROM teachers +ORDER BY school ASC, hire_date DESC; + +-- Listing 2-7: Filtering rows using WHERE + +SELECT last_name, school, hire_date +FROM teachers +WHERE school = 'Myers Middle School'; + +-- Examples of WHERE comparison operators + +-- Teachers with first name of Janet +SELECT first_name, last_name, school +FROM teachers +WHERE first_name = 'Janet'; + +-- School names not equal to F.D. Roosevelt HS +SELECT school +FROM teachers +WHERE school != 'F.D. Roosevelt HS'; + +-- Teachers hired before Jan. 
1, 2000 +SELECT first_name, last_name, hire_date +FROM teachers +WHERE hire_date < '2000-01-01'; + +-- Teachers earning 43,500 or more +SELECT first_name, last_name, salary +FROM teachers +WHERE salary >= 43500; + +-- Teachers who earn between $40,000 and $65,000 +SELECT first_name, last_name, school, salary +FROM teachers +WHERE salary BETWEEN 40000 AND 65000; + +-- Listing 2-8: Filtering with LIKE AND ILIKE + +SELECT first_name +FROM teachers +WHERE first_name LIKE 'sam%'; + +SELECT first_name +FROM teachers +WHERE first_name ILIKE 'sam%'; + +-- Listing 2-9: Combining operators using AND and OR + +SELECT * +FROM teachers +WHERE school = 'Myers Middle School' + AND salary < 40000; + +SELECT * +FROM teachers +WHERE last_name = 'Cole' + OR last_name = 'Bush'; + +SELECT * +FROM teachers +WHERE school = 'F.D. Roosevelt HS' + AND (salary < 38000 OR salary > 40000); + +-- Listing 2-10: A SELECT statement including WHERE and ORDER BY + +SELECT first_name, last_name, school, hire_date, salary +FROM teachers +WHERE school LIKE '%Roos%' +ORDER BY hire_date DESC; diff --git a/Chapter_04/Chapter_04.sql b/Chapter_04/Chapter_04.sql new file mode 100644 index 0000000..716669c --- /dev/null +++ b/Chapter_04/Chapter_04.sql @@ -0,0 +1,90 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 3 Code Examples +-------------------------------------------------------------- + +-- Listing 3-1: Character data types in action + +CREATE TABLE char_data_types ( + varchar_column varchar(10), + char_column char(10), + text_column text +); + +INSERT INTO char_data_types +VALUES + ('abc', 'abc', 'abc'), + ('defghi', 'defghi', 'defghi'); + +COPY char_data_types TO 'C:\YourDirectory\typetest.txt' +WITH (FORMAT CSV, HEADER, DELIMITER '|'); + + +-- Listing 3-2: Number data types in action + +CREATE TABLE number_data_types ( + numeric_column numeric(20,5), + real_column real, + 
double_column double precision +); + +INSERT INTO number_data_types +VALUES + (.7, .7, .7), + (2.13579, 2.13579, 2.13579), + (2.1357987654, 2.1357987654, 2.1357987654); + +SELECT * FROM number_data_types; + +-- Listing 3-3: Rounding issues with float columns +-- Assumes table created and loaded with Listing 3-2 + +SELECT + numeric_column * 10000000 AS "Fixed", + real_column * 10000000 AS "Float" +FROM number_data_types +WHERE numeric_column = .7; + +-- Listing 3-4: Timestamp and interval types in action + +CREATE TABLE date_time_types ( + timestamp_column timestamp with time zone, + interval_column interval +); + +INSERT INTO date_time_types +VALUES + ('2018-12-31 01:00 EST','2 days'), + ('2018-12-31 01:00 PST','1 month'), + ('2018-12-31 01:00 Australia/Melbourne','1 century'), + (now(),'1 week'); + +SELECT * FROM date_time_types; + +-- Listing 3-5: Using the interval data type +-- Assumes script 3-4 has been run + +SELECT + timestamp_column, + interval_column, + timestamp_column - interval_column AS new_date +FROM date_time_types; + +-- Listing 3-6: Three CAST() examples + +SELECT timestamp_column, CAST(timestamp_column AS varchar(10)) +FROM date_time_types; + +SELECT numeric_column, + CAST(numeric_column AS integer), + CAST(numeric_column AS varchar(6)) +FROM number_data_types; + +-- Does not work: +SELECT CAST(char_column AS integer) FROM char_data_types; + +-- Alternate notation for CAST is the double-colon: +SELECT timestamp_column::varchar(10) +FROM date_time_types; diff --git a/Chapter_05/Chapter_05.sql b/Chapter_05/Chapter_05.sql new file mode 100644 index 0000000..b34888c --- /dev/null +++ b/Chapter_05/Chapter_05.sql @@ -0,0 +1,212 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 4 Code Examples +-------------------------------------------------------------- + +-- Listing 4-1: Using COPY for data import +-- This is example syntax only; 
running it will produce an error + +COPY table_name +FROM 'C:\YourDirectory\your_file.csv' +WITH (FORMAT CSV, HEADER); + + +-- Listing 4-2: A CREATE TABLE statement for Census county data +-- Full data dictionary available at: http://www.census.gov/prod/cen2010/doc/pl94-171.pdf +-- Note: Some columns have been given more descriptive names + +CREATE TABLE us_counties_2010 ( + geo_name varchar(90), -- Name of the geography + state_us_abbreviation varchar(2), -- State/U.S. abbreviation + summary_level varchar(3), -- Summary Level + region smallint, -- Region + division smallint, -- Division + state_fips varchar(2), -- State FIPS code + county_fips varchar(3), -- County code + area_land bigint, -- Area (Land) in square meters + area_water bigint, -- Area (Water) in square meters + population_count_100_percent integer, -- Population count (100%) + housing_unit_count_100_percent integer, -- Housing Unit count (100%) + internal_point_lat numeric(10,7), -- Internal point (latitude) + internal_point_lon numeric(10,7), -- Internal point (longitude) + + -- This section is referred to as P1. 
Race: + p0010001 integer, -- Total population + p0010002 integer, -- Population of one race: + p0010003 integer, -- White Alone + p0010004 integer, -- Black or African American alone + p0010005 integer, -- American Indian and Alaska Native alone + p0010006 integer, -- Asian alone + p0010007 integer, -- Native Hawaiian and Other Pacific Islander alone + p0010008 integer, -- Some Other Race alone + p0010009 integer, -- Population of two or more races + p0010010 integer, -- Population of two races: + p0010011 integer, -- White; Black or African American + p0010012 integer, -- White; American Indian and Alaska Native + p0010013 integer, -- White; Asian + p0010014 integer, -- White; Native Hawaiian and Other Pacific Islander + p0010015 integer, -- White; Some Other Race + p0010016 integer, -- Black or African American; American Indian and Alaska Native + p0010017 integer, -- Black or African American; Asian + p0010018 integer, -- Black or African American; Native Hawaiian and Other Pacific Islander + p0010019 integer, -- Black or African American; Some Other Race + p0010020 integer, -- American Indian and Alaska Native; Asian + p0010021 integer, -- American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander + p0010022 integer, -- American Indian and Alaska Native; Some Other Race + p0010023 integer, -- Asian; Native Hawaiian and Other Pacific Islander + p0010024 integer, -- Asian; Some Other Race + p0010025 integer, -- Native Hawaiian and Other Pacific Islander; Some Other Race + p0010026 integer, -- Population of three races + p0010047 integer, -- Population of four races + p0010063 integer, -- Population of five races + p0010070 integer, -- Population of six races + + -- This section is referred to as P2. 
HISPANIC OR LATINO, AND NOT HISPANIC OR LATINO BY RACE + p0020001 integer, -- Total + p0020002 integer, -- Hispanic or Latino + p0020003 integer, -- Not Hispanic or Latino: + p0020004 integer, -- Population of one race: + p0020005 integer, -- White Alone + p0020006 integer, -- Black or African American alone + p0020007 integer, -- American Indian and Alaska Native alone + p0020008 integer, -- Asian alone + p0020009 integer, -- Native Hawaiian and Other Pacific Islander alone + p0020010 integer, -- Some Other Race alone + p0020011 integer, -- Two or More Races + p0020012 integer, -- Population of two races + p0020028 integer, -- Population of three races + p0020049 integer, -- Population of four races + p0020065 integer, -- Population of five races + p0020072 integer, -- Population of six races + + -- This section is referred to as P3. RACE FOR THE POPULATION 18 YEARS AND OVER + p0030001 integer, -- Total + p0030002 integer, -- Population of one race: + p0030003 integer, -- White alone + p0030004 integer, -- Black or African American alone + p0030005 integer, -- American Indian and Alaska Native alone + p0030006 integer, -- Asian alone + p0030007 integer, -- Native Hawaiian and Other Pacific Islander alone + p0030008 integer, -- Some Other Race alone + p0030009 integer, -- Two or More Races + p0030010 integer, -- Population of two races + p0030026 integer, -- Population of three races + p0030047 integer, -- Population of four races + p0030063 integer, -- Population of five races + p0030070 integer, -- Population of six races + + -- This section is referred to as P4. 
HISPANIC OR LATINO, AND NOT HISPANIC OR LATINO BY RACE + -- FOR THE POPULATION 18 YEARS AND OVER + p0040001 integer, -- Total + p0040002 integer, -- Hispanic or Latino + p0040003 integer, -- Not Hispanic or Latino: + p0040004 integer, -- Population of one race: + p0040005 integer, -- White alone + p0040006 integer, -- Black or African American alone + p0040007 integer, -- American Indian and Alaska Native alone + p0040008 integer, -- Asian alone + p0040009 integer, -- Native Hawaiian and Other Pacific Islander alone + p0040010 integer, -- Some Other Race alone + p0040011 integer, -- Two or More Races + p0040012 integer, -- Population of two races + p0040028 integer, -- Population of three races + p0040049 integer, -- Population of four races + p0040065 integer, -- Population of five races + p0040072 integer, -- Population of six races + + -- This section is referred to as H1. OCCUPANCY STATUS + h0010001 integer, -- Total housing units + h0010002 integer, -- Occupied + h0010003 integer -- Vacant +); + +SELECT * FROM us_counties_2010; + +-- Listing 4-3: Importing Census data using COPY +-- Note! If you run into an import error here, be sure you downloaded the code and +-- data for the book according to the steps listed on page xxvii in the Introduction. +-- Windows users: Please check the Note on page xxvii as well. 
+ +COPY us_counties_2010 +FROM 'C:\YourDirectory\us_counties_2010.csv' +WITH (FORMAT CSV, HEADER); + +-- Checking the data + +SELECT * FROM us_counties_2010; + +SELECT geo_name, state_us_abbreviation, area_land +FROM us_counties_2010 +ORDER BY area_land DESC +LIMIT 3; + +SELECT geo_name, state_us_abbreviation, internal_point_lon +FROM us_counties_2010 +ORDER BY internal_point_lon DESC +LIMIT 5; + + +-- Listing 4-4: Creating a table to track supervisor salaries + +CREATE TABLE supervisor_salaries ( + town varchar(30), + county varchar(30), + supervisor varchar(30), + start_date date, + salary money, + benefits money +); + +-- Listing 4-5: Importing salaries data from CSV to three table columns + +COPY supervisor_salaries (town, supervisor, salary) +FROM 'C:\YourDirectory\supervisor_salaries.csv' +WITH (FORMAT CSV, HEADER); + +-- Check the data +SELECT * FROM supervisor_salaries LIMIT 2; + +-- Listing 4-6 Use a temporary table to add a default value to a column during +-- import + +DELETE FROM supervisor_salaries; + +CREATE TEMPORARY TABLE supervisor_salaries_temp (LIKE supervisor_salaries); + +COPY supervisor_salaries_temp (town, supervisor, salary) +FROM 'C:\YourDirectory\supervisor_salaries.csv' +WITH (FORMAT CSV, HEADER); + +INSERT INTO supervisor_salaries (town, county, supervisor, salary) +SELECT town, 'Some County', supervisor, salary +FROM supervisor_salaries_temp; + +DROP TABLE supervisor_salaries_temp; + +-- Check the data +SELECT * FROM supervisor_salaries LIMIT 2; + +-- Listing 4-7: Export an entire table with COPY + +COPY us_counties_2010 +TO 'C:\YourDirectory\us_counties_export.txt' +WITH (FORMAT CSV, HEADER, DELIMITER '|'); + + +-- Listing 4-8: Exporting selected columns from a table with COPY + +COPY us_counties_2010 (geo_name, internal_point_lat, internal_point_lon) +TO 'C:\YourDirectory\us_counties_latlon_export.txt' +WITH (FORMAT CSV, HEADER, DELIMITER '|'); + +-- Listing 4-9: Exporting query results with COPY + +COPY ( + SELECT geo_name, 
state_us_abbreviation + FROM us_counties_2010 + WHERE geo_name ILIKE '%mill%' + ) +TO 'C:\YourDirectory\us_counties_mill_export.txt' +WITH (FORMAT CSV, HEADER, DELIMITER '|'); diff --git a/Chapter_06/Chapter_06.sql b/Chapter_06/Chapter_06.sql new file mode 100644 index 0000000..dacc76f --- /dev/null +++ b/Chapter_06/Chapter_06.sql @@ -0,0 +1,207 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 5 Code Examples +-------------------------------------------------------------- + +-- Listing 5-1: Basic addition, subtraction and multiplication with SQL + +SELECT 2 + 2; -- addition +SELECT 9 - 1; -- subtraction +SELECT 3 * 4; -- multiplication + +-- Listing 5-2: Integer and decimal division with SQL + +SELECT 11 / 6; -- integer division +SELECT 11 % 6; -- modulo division +SELECT 11.0 / 6; -- decimal division +SELECT CAST(11 AS numeric(3,1)) / 6; + +-- Listing 5-3: Exponents, roots and factorials with SQL + +SELECT 3 ^ 4; -- exponentiation +SELECT |/ 10; -- square root (operator) +SELECT sqrt(10); -- square root (function) +SELECT ||/ 10; -- cube root +SELECT 4 !; -- factorial + +-- Order of operations + +SELECT 7 + 8 * 9; -- answer: 79 +SELECT (7 + 8) * 9; -- answer: 135 + +SELECT 3 ^ 3 - 1; -- answer: 26 +SELECT 3 ^ (3 - 1); -- answer: 9 + +-- Listing 5-4: Selecting Census population columns by race with aliases + +SELECT geo_name, + state_us_abbreviation AS "st", + p0010001 AS "Total Population", + p0010003 AS "White Alone", + p0010004 AS "Black or African American Alone", + p0010005 AS "Am Indian/Alaska Native Alone", + p0010006 AS "Asian Alone", + p0010007 AS "Native Hawaiian and Other Pacific Islander Alone", + p0010008 AS "Some Other Race Alone", + p0010009 AS "Two or More Races" +FROM us_counties_2010; + +-- Listing 5-5: Adding two columns in us_counties_2010 + +SELECT geo_name, + state_us_abbreviation AS "st", + p0010003 AS "White Alone", + 
p0010004 AS "Black Alone", + p0010003 + p0010004 AS "Total White and Black" +FROM us_counties_2010; + +-- Listing 5-6: Checking Census data totals + +SELECT geo_name, + state_us_abbreviation AS "st", + p0010001 AS "Total", + p0010003 + p0010004 + p0010005 + p0010006 + p0010007 + + p0010008 + p0010009 AS "All Races", + (p0010003 + p0010004 + p0010005 + p0010006 + p0010007 + + p0010008 + p0010009) - p0010001 AS "Difference" +FROM us_counties_2010 +ORDER BY "Difference" DESC; + +-- Listing 5-7: Calculating the percent of the population that is +-- Asian by county (percent of the whole) + +SELECT geo_name, + state_us_abbreviation AS "st", + (CAST(p0010006 AS numeric(8,1)) / p0010001) * 100 AS "pct_asian" +FROM us_counties_2010 +ORDER BY "pct_asian" DESC; + +-- Listing 5-8: Calculating percent change + +CREATE TABLE percent_change ( + department varchar(20), + spend_2014 numeric(10,2), + spend_2017 numeric(10,2) +); + +INSERT INTO percent_change +VALUES + ('Building', 250000, 289000), + ('Assessor', 178556, 179500), + ('Library', 87777, 90001), + ('Clerk', 451980, 650000), + ('Police', 250000, 223000), + ('Recreation', 199000, 195000); + +SELECT department, + spend_2014, + spend_2017, + round( (spend_2017 - spend_2014) / + spend_2014 * 100, 1 ) AS "pct_change" +FROM percent_change; + +-- Listing 5-9: Using sum() and avg() aggregate functions + +SELECT sum(p0010001) AS "County Sum", + round(avg(p0010001), 0) AS "County Average" +FROM us_counties_2010; + +-- Listing 5-10: Testing SQL percentile functions + +CREATE TABLE percentile_test ( + numbers integer +); + +INSERT INTO percentile_test (numbers) VALUES + (1), (2), (3), (4), (5), (6); + +SELECT + percentile_cont(.5) + WITHIN GROUP (ORDER BY numbers), + percentile_disc(.5) + WITHIN GROUP (ORDER BY numbers) +FROM percentile_test; + +-- Listing 5-11: Using sum(), avg(), and percentile_cont() aggregate functions + +SELECT sum(p0010001) AS "County Sum", + round(avg(p0010001), 0) AS "County Average", + percentile_cont(.5) + 
WITHIN GROUP (ORDER BY p0010001) AS "County Median" +FROM us_counties_2010; + +-- Listing 5-12: Passing an array of values to percentile_cont() + +-- quartiles +SELECT percentile_cont(array[.25,.5,.75]) + WITHIN GROUP (ORDER BY p0010001) AS "quartiles" +FROM us_counties_2010; + +-- Extra: +-- quintiles +SELECT percentile_cont(array[.2,.4,.6,.8]) + WITHIN GROUP (ORDER BY p0010001) AS "quintiles" +FROM us_counties_2010; + +-- deciles +SELECT percentile_cont(array[.1,.2,.3,.4,.5,.6,.7,.8,.9]) + WITHIN GROUP (ORDER BY p0010001) AS "deciles" +FROM us_counties_2010; + +-- Listing 5-13: Using unnest() to turn an array into rows + +SELECT unnest( + percentile_cont(array[.25,.5,.75]) + WITHIN GROUP (ORDER BY p0010001) + ) AS "quartiles" +FROM us_counties_2010; + +-- Listing 5-14: Creating a median() aggregate function in PostgreSQL +-- Source: https://wiki.postgresql.org/wiki/Aggregate_Median + +CREATE OR REPLACE FUNCTION _final_median(anyarray) + RETURNS float8 AS +$$ + WITH q AS + ( + SELECT val + FROM unnest($1) val + WHERE VAL IS NOT NULL + ORDER BY 1 + ), + cnt AS + ( + SELECT COUNT(*) AS c FROM q + ) + SELECT AVG(val)::float8 + FROM + ( + SELECT val FROM q + LIMIT 2 - MOD((SELECT c FROM cnt), 2) + OFFSET GREATEST(CEIL((SELECT c FROM cnt) / 2.0) - 1,0) + ) q2; +$$ +LANGUAGE sql IMMUTABLE; + +CREATE AGGREGATE median(anyelement) ( + SFUNC=array_append, + STYPE=anyarray, + FINALFUNC=_final_median, + INITCOND='{}' +); + +-- Listing 5-15: Using a median() aggregate function + +SELECT sum(p0010001) AS "County Sum", + round(avg(p0010001), 0) AS "County Average", + median(p0010001) AS "County Median", + percentile_cont(.5) + WITHIN GROUP (ORDER BY P0010001) AS "50th Percentile" +FROM us_counties_2010; + +-- Listing 5-16: Finding the most-frequent value with mode() + +SELECT mode() WITHIN GROUP (ORDER BY p0010001) +FROM us_counties_2010; diff --git a/Chapter_07/Chapter_07.sql b/Chapter_07/Chapter_07.sql new file mode 100644 index 0000000..b8130da --- /dev/null +++ 
b/Chapter_07/Chapter_07.sql @@ -0,0 +1,196 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 6 Code Examples +-------------------------------------------------------------- + +-- Listing 6-1: Creating the departments and employees tables + +CREATE TABLE departments ( + dept_id bigserial, + dept varchar(100), + city varchar(100), + CONSTRAINT dept_key PRIMARY KEY (dept_id), + CONSTRAINT dept_city_unique UNIQUE (dept, city) +); + +CREATE TABLE employees ( + emp_id bigserial, + first_name varchar(100), + last_name varchar(100), + salary integer, + dept_id integer REFERENCES departments (dept_id), + CONSTRAINT emp_key PRIMARY KEY (emp_id), + CONSTRAINT emp_dept_unique UNIQUE (emp_id, dept_id) +); + +INSERT INTO departments (dept, city) +VALUES + ('Tax', 'Atlanta'), + ('IT', 'Boston'); + +INSERT INTO employees (first_name, last_name, salary, dept_id) +VALUES + ('Nancy', 'Jones', 62500, 1), + ('Lee', 'Smith', 59300, 1), + ('Soo', 'Nguyen', 83000, 2), + ('Janet', 'King', 95000, 2); + +-- Listing 6-2: Joining the employees and departments tables + +SELECT * +FROM employees JOIN departments +ON employees.dept_id = departments.dept_id; + +-- Listing 6-3: Creating two tables to explore JOIN types + +CREATE TABLE schools_left ( + id integer CONSTRAINT left_id_key PRIMARY KEY, + left_school varchar(30) +); + +CREATE TABLE schools_right ( + id integer CONSTRAINT right_id_key PRIMARY KEY, + right_school varchar(30) +); + +INSERT INTO schools_left (id, left_school) VALUES + (1, 'Oak Street School'), + (2, 'Roosevelt High School'), + (5, 'Washington Middle School'), + (6, 'Jefferson High School'); + +INSERT INTO schools_right (id, right_school) VALUES + (1, 'Oak Street School'), + (2, 'Roosevelt High School'), + (3, 'Morrison Elementary'), + (4, 'Chase Magnet Academy'), + (6, 'Jefferson High School'); + +-- Listing 6-4: Using JOIN + +SELECT * +FROM schools_left 
JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Bonus: Also can be specified as INNER JOIN + +SELECT * +FROM schools_left INNER JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Listing 6-5: Using LEFT JOIN + +SELECT * +FROM schools_left LEFT JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Listing 6-6: Using RIGHT JOIN + +SELECT * +FROM schools_left RIGHT JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Listing 6-7: Using FULL OUTER JOIN + +SELECT * +FROM schools_left FULL OUTER JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Listing 6-8: Using CROSS JOIN + +SELECT * +FROM schools_left CROSS JOIN schools_right; + +-- Listing 6-9: Filtering to show missing values with IS NULL + +SELECT * +FROM schools_left LEFT JOIN schools_right +ON schools_left.id = schools_right.id +WHERE schools_right.id IS NULL; + +-- Listing 6-10: Querying specific columns in a join +SELECT schools_left.id, + schools_left.left_school, + schools_right.right_school +FROM schools_left LEFT JOIN schools_right +ON schools_left.id = schools_right.id; + +-- Listing 6-11: Simplifying code with table aliases +SELECT lt.id, + lt.left_school, + rt.right_school +FROM schools_left AS lt LEFT JOIN schools_right AS rt +ON lt.id = rt.id; + +-- Listing 6-12: Joining multiple tables +CREATE TABLE schools_enrollment ( + id integer, + enrollment integer +); + +CREATE TABLE schools_grades ( + id integer, + grades varchar(10) +); + +INSERT INTO schools_enrollment (id, enrollment) +VALUES + (1, 360), + (2, 1001), + (5, 450), + (6, 927); + +INSERT INTO schools_grades (id, grades) +VALUES + (1, 'K-3'), + (2, '9-12'), + (5, '6-8'), + (6, '9-12'); + +SELECT lt.id, lt.left_school, en.enrollment, gr.grades +FROM schools_left AS lt LEFT JOIN schools_enrollment AS en + ON lt.id = en.id +LEFT JOIN schools_grades AS gr + ON lt.id = gr.id; + +-- Listing 6-13: Performing math on joined Census tables +-- Decennial Census 2000. 
Full data dictionary at https://www.census.gov/prod/cen2000/doc/pl94-171.pdf +-- Note: Some non-number columns have been given more descriptive names + +CREATE TABLE us_counties_2000 ( + geo_name varchar(90), -- County/state name, + state_us_abbreviation varchar(2), -- State/U.S. abbreviation + state_fips varchar(2), -- State FIPS code + county_fips varchar(3), -- County code + p0010001 integer, -- Total population + p0010002 integer, -- Population of one race: + p0010003 integer, -- White Alone + p0010004 integer, -- Black or African American alone + p0010005 integer, -- American Indian and Alaska Native alone + p0010006 integer, -- Asian alone + p0010007 integer, -- Native Hawaiian and Other Pacific Islander alone + p0010008 integer, -- Some Other Race alone + p0010009 integer, -- Population of two or more races + p0010010 integer, -- Population of two races + p0020002 integer, -- Hispanic or Latino + p0020003 integer -- Not Hispanic or Latino: +); + +COPY us_counties_2000 +FROM 'C:\YourDirectory\us_counties_2000.csv' +WITH (FORMAT CSV, HEADER); + +SELECT c2010.geo_name, + c2010.state_us_abbreviation AS state, + c2010.p0010001 AS pop_2010, + c2000.p0010001 AS pop_2000, + c2010.p0010001 - c2000.p0010001 AS raw_change, + round( (CAST(c2010.p0010001 AS numeric(8,1)) - c2000.p0010001) + / c2000.p0010001 * 100, 1 ) AS pct_change +FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000 +ON c2010.state_fips = c2000.state_fips + AND c2010.county_fips = c2000.county_fips + AND c2010.p0010001 <> c2000.p0010001 +ORDER BY pct_change DESC; diff --git a/Chapter_08/Chapter_08.sql b/Chapter_08/Chapter_08.sql new file mode 100644 index 0000000..3cc6cac --- /dev/null +++ b/Chapter_08/Chapter_08.sql @@ -0,0 +1,184 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 7 Code Examples +-------------------------------------------------------------- + +-- Listing 
7-1: Declaring a single-column natural key as primary key + +-- As a column constraint +CREATE TABLE natural_key_example ( + license_id varchar(10) CONSTRAINT license_key PRIMARY KEY, + first_name varchar(50), + last_name varchar(50) +); + +-- Drop the table before trying again +DROP TABLE natural_key_example; + +-- As a table constraint +CREATE TABLE natural_key_example ( + license_id varchar(10), + first_name varchar(50), + last_name varchar(50), + CONSTRAINT license_key PRIMARY KEY (license_id) +); + +-- Listing 7-2: Example of a primary key violation +INSERT INTO natural_key_example (license_id, first_name, last_name) +VALUES ('T229901', 'Lynn', 'Malero'); + +INSERT INTO natural_key_example (license_id, first_name, last_name) +VALUES ('T229901', 'Sam', 'Tracy'); + +-- Listing 7-3: Declaring a composite primary key as a natural key +CREATE TABLE natural_key_composite_example ( + student_id varchar(10), + school_day date, + present boolean, + CONSTRAINT student_key PRIMARY KEY (student_id, school_day) +); + +-- Listing 7-4: Example of a composite primary key violation + +INSERT INTO natural_key_composite_example (student_id, school_day, present) +VALUES(775, '1/22/2017', 'Y'); + +INSERT INTO natural_key_composite_example (student_id, school_day, present) +VALUES(775, '1/23/2017', 'Y'); + +INSERT INTO natural_key_composite_example (student_id, school_day, present) +VALUES(775, '1/23/2017', 'N'); + +-- Listing 7-5: Declaring a bigserial column as a surrogate key + +CREATE TABLE surrogate_key_example ( + order_number bigserial, + product_name varchar(50), + order_date date, + CONSTRAINT order_key PRIMARY KEY (order_number) +); + +INSERT INTO surrogate_key_example (product_name, order_date) +VALUES ('Beachball Polish', '2015-03-17'), + ('Wrinkle De-Atomizer', '2017-05-22'), + ('Flux Capacitor', '1985-10-26'); + +SELECT * FROM surrogate_key_example; + +-- Listing 7-6: A foreign key example + +CREATE TABLE licenses ( + license_id varchar(10), + first_name varchar(50), 
+ last_name varchar(50), + CONSTRAINT licenses_key PRIMARY KEY (license_id) +); + +CREATE TABLE registrations ( + registration_id varchar(10), + registration_date date, + license_id varchar(10) REFERENCES licenses (license_id), + CONSTRAINT registration_key PRIMARY KEY (registration_id, license_id) +); + +INSERT INTO licenses (license_id, first_name, last_name) +VALUES ('T229901', 'Lynn', 'Malero'); + +INSERT INTO registrations (registration_id, registration_date, license_id) +VALUES ('A203391', '3/17/2017', 'T229901'); + +INSERT INTO registrations (registration_id, registration_date, license_id) +VALUES ('A75772', '3/17/2017', 'T000001'); + +-- Listing 7-7: CHECK constraint examples + +CREATE TABLE check_constraint_example ( + user_id bigserial, + user_role varchar(50), + salary integer, + CONSTRAINT user_id_key PRIMARY KEY (user_id), + CONSTRAINT check_role_in_list CHECK (user_role IN('Admin', 'Staff')), + CONSTRAINT check_salary_not_zero CHECK (salary > 0) +); + +-- Both of these will fail: +INSERT INTO check_constraint_example (user_role) +VALUES ('admin'); + +INSERT INTO check_constraint_example (salary) +VALUES (0); + +-- Listing 7-8: UNIQUE constraint example + +CREATE TABLE unique_constraint_example ( + contact_id bigserial CONSTRAINT contact_id_key PRIMARY KEY, + first_name varchar(50), + last_name varchar(50), + email varchar(200), + CONSTRAINT email_unique UNIQUE (email) +); + +INSERT INTO unique_constraint_example (first_name, last_name, email) +VALUES ('Samantha', 'Lee', 'slee@example.org'); + +INSERT INTO unique_constraint_example (first_name, last_name, email) +VALUES ('Betty', 'Diaz', 'bdiaz@example.org'); + +INSERT INTO unique_constraint_example (first_name, last_name, email) +VALUES ('Sasha', 'Lee', 'slee@example.org'); + +-- Listing 7-9: NOT NULL constraint example + +CREATE TABLE not_null_example ( + student_id bigserial, + first_name varchar(50) NOT NULL, + last_name varchar(50) NOT NULL, + CONSTRAINT student_id_key PRIMARY KEY (student_id) +); 
+ +-- Listing 7-10: Dropping and adding a primary key and a NOT NULL constraint + +-- Drop +ALTER TABLE not_null_example DROP CONSTRAINT student_id_key; + +-- Add +ALTER TABLE not_null_example ADD CONSTRAINT student_id_key PRIMARY KEY (student_id); + +-- Drop +ALTER TABLE not_null_example ALTER COLUMN first_name DROP NOT NULL; + +-- Add +ALTER TABLE not_null_example ALTER COLUMN first_name SET NOT NULL; + +-- Listing 7-11: Importing New York City address data + +CREATE TABLE new_york_addresses ( + longitude numeric(9,6), + latitude numeric(9,6), + street_number varchar(10), + street varchar(32), + unit varchar(7), + postcode varchar(5), + id integer CONSTRAINT new_york_key PRIMARY KEY +); + +COPY new_york_addresses +FROM 'C:\YourDirectory\city_of_new_york.csv' +WITH (FORMAT CSV, HEADER); + +-- Listing 7-12: Benchmark queries for index performance + +EXPLAIN ANALYZE SELECT * FROM new_york_addresses +WHERE street = 'BROADWAY'; + +EXPLAIN ANALYZE SELECT * FROM new_york_addresses +WHERE street = '52 STREET'; + +EXPLAIN ANALYZE SELECT * FROM new_york_addresses +WHERE street = 'ZWICKY AVENUE'; + +-- Listing 7-13: Creating a B-Tree index on the new_york_addresses table + +CREATE INDEX street_idx ON new_york_addresses (street); diff --git a/Chapter_09/Chapter_09.sql b/Chapter_09/Chapter_09.sql new file mode 100644 index 0000000..e44dc35 --- /dev/null +++ b/Chapter_09/Chapter_09.sql @@ -0,0 +1,299 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 8 Code Examples +-------------------------------------------------------------- + +-- Listing 8-1: Creating and filling the 2014 Public Libraries Survey table + +CREATE TABLE pls_fy2014_pupld14a ( + stabr varchar(2) NOT NULL, + fscskey varchar(6) CONSTRAINT fscskey2014_key PRIMARY KEY, + libid varchar(20) NOT NULL, + libname varchar(100) NOT NULL, + obereg varchar(2) NOT NULL, + rstatus integer NOT NULL, + statstru 
varchar(2) NOT NULL, + statname varchar(2) NOT NULL, + stataddr varchar(2) NOT NULL, + longitud numeric(10,7) NOT NULL, + latitude numeric(10,7) NOT NULL, + fipsst varchar(2) NOT NULL, + fipsco varchar(3) NOT NULL, + address varchar(35) NOT NULL, + city varchar(20) NOT NULL, + zip varchar(5) NOT NULL, + zip4 varchar(4) NOT NULL, + cnty varchar(20) NOT NULL, + phone varchar(10) NOT NULL, + c_relatn varchar(2) NOT NULL, + c_legbas varchar(2) NOT NULL, + c_admin varchar(2) NOT NULL, + geocode varchar(3) NOT NULL, + lsabound varchar(1) NOT NULL, + startdat varchar(10), + enddate varchar(10), + popu_lsa integer NOT NULL, + centlib integer NOT NULL, + branlib integer NOT NULL, + bkmob integer NOT NULL, + master numeric(8,2) NOT NULL, + libraria numeric(8,2) NOT NULL, + totstaff numeric(8,2) NOT NULL, + locgvt integer NOT NULL, + stgvt integer NOT NULL, + fedgvt integer NOT NULL, + totincm integer NOT NULL, + salaries integer, + benefit integer, + staffexp integer, + prmatexp integer NOT NULL, + elmatexp integer NOT NULL, + totexpco integer NOT NULL, + totopexp integer NOT NULL, + lcap_rev integer NOT NULL, + scap_rev integer NOT NULL, + fcap_rev integer NOT NULL, + cap_rev integer NOT NULL, + capital integer NOT NULL, + bkvol integer NOT NULL, + ebook integer NOT NULL, + audio_ph integer NOT NULL, + audio_dl float NOT NULL, + video_ph integer NOT NULL, + video_dl float NOT NULL, + databases integer NOT NULL, + subscrip integer NOT NULL, + hrs_open integer NOT NULL, + visits integer NOT NULL, + referenc integer NOT NULL, + regbor integer NOT NULL, + totcir integer NOT NULL, + kidcircl integer NOT NULL, + elmatcir integer NOT NULL, + loanto integer NOT NULL, + loanfm integer NOT NULL, + totpro integer NOT NULL, + totatten integer NOT NULL, + gpterms integer NOT NULL, + pitusr integer NOT NULL, + wifisess integer NOT NULL, + yr_sub integer NOT NULL +); + +CREATE INDEX libname2014_idx ON pls_fy2014_pupld14a (libname); +CREATE INDEX stabr2014_idx ON pls_fy2014_pupld14a 
(stabr); +CREATE INDEX city2014_idx ON pls_fy2014_pupld14a (city); +CREATE INDEX visits2014_idx ON pls_fy2014_pupld14a (visits); + +COPY pls_fy2014_pupld14a +FROM 'C:\YourDirectory\pls_fy2014_pupld14a.csv' +WITH (FORMAT CSV, HEADER); + +-- Listing 8-2: Creating and filling the 2009 Public Libraries Survey table + +CREATE TABLE pls_fy2009_pupld09a ( + stabr varchar(2) NOT NULL, + fscskey varchar(6) CONSTRAINT fscskey2009_key PRIMARY KEY, + libid varchar(20) NOT NULL, + libname varchar(100) NOT NULL, + address varchar(35) NOT NULL, + city varchar(20) NOT NULL, + zip varchar(5) NOT NULL, + zip4 varchar(4) NOT NULL, + cnty varchar(20) NOT NULL, + phone varchar(10) NOT NULL, + c_relatn varchar(2) NOT NULL, + c_legbas varchar(2) NOT NULL, + c_admin varchar(2) NOT NULL, + geocode varchar(3) NOT NULL, + lsabound varchar(1) NOT NULL, + startdat varchar(10), + enddate varchar(10), + popu_lsa integer NOT NULL, + centlib integer NOT NULL, + branlib integer NOT NULL, + bkmob integer NOT NULL, + master numeric(8,2) NOT NULL, + libraria numeric(8,2) NOT NULL, + totstaff numeric(8,2) NOT NULL, + locgvt integer NOT NULL, + stgvt integer NOT NULL, + fedgvt integer NOT NULL, + totincm integer NOT NULL, + salaries integer, + benefit integer, + staffexp integer, + prmatexp integer NOT NULL, + elmatexp integer NOT NULL, + totexpco integer NOT NULL, + totopexp integer NOT NULL, + lcap_rev integer NOT NULL, + scap_rev integer NOT NULL, + fcap_rev integer NOT NULL, + cap_rev integer NOT NULL, + capital integer NOT NULL, + bkvol integer NOT NULL, + ebook integer NOT NULL, + audio integer NOT NULL, + video integer NOT NULL, + databases integer NOT NULL, + subscrip integer NOT NULL, + hrs_open integer NOT NULL, + visits integer NOT NULL, + referenc integer NOT NULL, + regbor integer NOT NULL, + totcir integer NOT NULL, + kidcircl integer NOT NULL, + loanto integer NOT NULL, + loanfm integer NOT NULL, + totpro integer NOT NULL, + totatten integer NOT NULL, + gpterms integer NOT NULL, + pitusr 
integer NOT NULL, + yr_sub integer NOT NULL, + obereg varchar(2) NOT NULL, + rstatus integer NOT NULL, + statstru varchar(2) NOT NULL, + statname varchar(2) NOT NULL, + stataddr varchar(2) NOT NULL, + longitud numeric(10,7) NOT NULL, + latitude numeric(10,7) NOT NULL, + fipsst varchar(2) NOT NULL, + fipsco varchar(3) NOT NULL +); + +CREATE INDEX libname2009_idx ON pls_fy2009_pupld09a (libname); +CREATE INDEX stabr2009_idx ON pls_fy2009_pupld09a (stabr); +CREATE INDEX city2009_idx ON pls_fy2009_pupld09a (city); +CREATE INDEX visits2009_idx ON pls_fy2009_pupld09a (visits); + +COPY pls_fy2009_pupld09a +FROM 'C:\YourDirectory\pls_fy2009_pupld09a.csv' +WITH (FORMAT CSV, HEADER); + +-- Listing 8-3: Using count() for table row counts + +SELECT count(*) +FROM pls_fy2014_pupld14a; + +SELECT count(*) +FROM pls_fy2009_pupld09a; + +-- Listing 8-4: Using count() for the number of values in a column + +SELECT count(salaries) +FROM pls_fy2014_pupld14a; + +-- Listing 8-5: Using count() for the number of distinct values in a column + +SELECT count(libname) +FROM pls_fy2014_pupld14a; + +SELECT count(DISTINCT libname) +FROM pls_fy2014_pupld14a; + +-- Bonus: find duplicate libnames +SELECT libname, count(libname) +FROM pls_fy2014_pupld14a +GROUP BY libname +ORDER BY count(libname) DESC; + +-- Bonus: see location of every Oxford Public Library +SELECT libname, city, stabr +FROM pls_fy2014_pupld14a +WHERE libname = 'OXFORD PUBLIC LIBRARY'; + +-- Listing 8-6: Finding the most and fewest visits using max() and min() +SELECT max(visits), min(visits) +FROM pls_fy2014_pupld14a; + +-- Listing 8-7: Using GROUP BY on the stabr column + +-- There are 56 in 2014. +SELECT stabr +FROM pls_fy2014_pupld14a +GROUP BY stabr +ORDER BY stabr; + +-- Bonus: there are 55 in 2009. 
+SELECT stabr +FROM pls_fy2009_pupld09a +GROUP BY stabr +ORDER BY stabr; + +-- Listing 8-8: Using GROUP BY on the city and stabr columns + +SELECT city, stabr +FROM pls_fy2014_pupld14a +GROUP BY city, stabr +ORDER BY city, stabr; + +-- Bonus: We can count some of the combos +SELECT city, stabr, count(*) +FROM pls_fy2014_pupld14a +GROUP BY city, stabr +ORDER BY count(*) DESC; + +-- Listing 8-9: GROUP BY with count() on the stabr column + +SELECT stabr, count(*) +FROM pls_fy2014_pupld14a +GROUP BY stabr +ORDER BY count(*) DESC; + +-- Listing 8-10: GROUP BY with count() on the stabr and stataddr columns + +SELECT stabr, stataddr, count(*) +FROM pls_fy2014_pupld14a +GROUP BY stabr, stataddr +ORDER BY stabr ASC, count(*) DESC; + +-- Listing 8-11: Using the sum() aggregate function to total visits to +-- libraries in 2014 and 2009 + +-- 2014 +SELECT sum(visits) AS visits_2014 +FROM pls_fy2014_pupld14a +WHERE visits >= 0; + +-- 2009 +SELECT sum(visits) AS visits_2009 +FROM pls_fy2009_pupld09a +WHERE visits >= 0; + +-- Listing 8-12: Using sum() to total visits on joined 2014 and 2009 library tables + +SELECT sum(pls14.visits) AS visits_2014, + sum(pls09.visits) AS visits_2009 +FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09 +ON pls14.fscskey = pls09.fscskey +WHERE pls14.visits >= 0 AND pls09.visits >= 0; + +-- Listing 8-13: Using GROUP BY to track percent change in library visits by state + +SELECT pls14.stabr, + sum(pls14.visits) AS visits_2014, + sum(pls09.visits) AS visits_2009, + round( (CAST(sum(pls14.visits) AS decimal(10,1)) - sum(pls09.visits)) / + sum(pls09.visits) * 100, 2 ) AS pct_change +FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09 +ON pls14.fscskey = pls09.fscskey +WHERE pls14.visits >= 0 AND pls09.visits >= 0 +GROUP BY pls14.stabr +ORDER BY pct_change DESC; + +-- Listing 8-14: Using HAVING to filter the results of an aggregate query + +SELECT pls14.stabr, + sum(pls14.visits) AS visits_2014, + sum(pls09.visits) AS visits_2009, + 
round( (CAST(sum(pls14.visits) AS decimal(10,1)) - sum(pls09.visits)) / + sum(pls09.visits) * 100, 2 ) AS pct_change +FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09 +ON pls14.fscskey = pls09.fscskey +WHERE pls14.visits >= 0 AND pls09.visits >= 0 +GROUP BY pls14.stabr +HAVING sum(pls14.visits) > 50000000 +ORDER BY pct_change DESC; diff --git a/Chapter_10/Chapter_10.sql b/Chapter_10/Chapter_10.sql new file mode 100644 index 0000000..d4bfe6b --- /dev/null +++ b/Chapter_10/Chapter_10.sql @@ -0,0 +1,259 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 9 Code Examples +-------------------------------------------------------------- + +-- Listing 9-1: Importing the FSIS Meat, Poultry, and Egg Inspection Directory +-- https://catalog.data.gov/dataset/meat-poultry-and-egg-inspection-directory-by-establishment-name + +CREATE TABLE meat_poultry_egg_inspect ( + est_number varchar(50) CONSTRAINT est_number_key PRIMARY KEY, + company varchar(100), + street varchar(100), + city varchar(30), + st varchar(2), + zip varchar(5), + phone varchar(14), + grant_date date, + activities text, + dbas text +); + +COPY meat_poultry_egg_inspect +FROM 'C:\YourDirectory\MPI_Directory_by_Establishment_Name.csv' +WITH (FORMAT CSV, HEADER, DELIMITER ','); + +CREATE INDEX company_idx ON meat_poultry_egg_inspect (company); + +-- Count the rows imported: +SELECT count(*) FROM meat_poultry_egg_inspect; + +-- Listing 9-2: Finding multiple companies at the same address +SELECT company, + street, + city, + st, + count(*) AS address_count +FROM meat_poultry_egg_inspect +GROUP BY company, street, city, st +HAVING count(*) > 1 +ORDER BY company, street, city, st; + +-- Listing 9-3: Grouping and counting states +SELECT st, + count(*) AS st_count +FROM meat_poultry_egg_inspect +GROUP BY st +ORDER BY st; + +-- Listing 9-4: Using IS NULL to find missing values in the st column 
-- Rows whose state column was left NULL by the import
SELECT est_number,
       company,
       city,
       st,
       zip
FROM meat_poultry_egg_inspect
WHERE st IS NULL;

-- Listing 9-5: Using GROUP BY and count() to surface inconsistent company names
SELECT company,
       count(*) AS company_count
FROM meat_poultry_egg_inspect
GROUP BY company
ORDER BY company ASC;

-- Listing 9-6: Using length() and count() to profile the zip column
SELECT length(zip),
       count(*) AS length_count
FROM meat_poultry_egg_inspect
GROUP BY length(zip)
ORDER BY length(zip) ASC;

-- Listing 9-7: Filtering with length() to see which states have short zips
SELECT st,
       count(*) AS st_count
FROM meat_poultry_egg_inspect
WHERE length(zip) < 5
GROUP BY st
ORDER BY st ASC;

-- Listing 9-8: Backing up the table before making changes
CREATE TABLE meat_poultry_egg_inspect_backup AS
SELECT * FROM meat_poultry_egg_inspect;

-- Sanity check: original and backup row counts should match
SELECT
    (SELECT count(*) FROM meat_poultry_egg_inspect) AS original,
    (SELECT count(*) FROM meat_poultry_egg_inspect_backup) AS backup;

-- Listing 9-9: Creating and filling the st_copy column with ALTER TABLE and UPDATE
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN st_copy varchar(2);

-- No WHERE clause: intentionally copies st into st_copy on every row
UPDATE meat_poultry_egg_inspect
SET st_copy = st;

-- Listing 9-10: Comparing the st and st_copy columns
SELECT st,
       st_copy
FROM meat_poultry_egg_inspect
ORDER BY st;

-- Listing 9-11: Correcting the st value for three establishments
UPDATE meat_poultry_egg_inspect
SET st = 'MN'
WHERE est_number = 'V18677A';

UPDATE meat_poultry_egg_inspect
SET st = 'AL'
WHERE est_number = 'M45319+P45319';

UPDATE meat_poultry_egg_inspect
SET st = 'WI'
WHERE est_number = 'M263A+P263A+V263A';

-- Listing 9-12: Restoring original st column values

-- Option 1: restore from the in-table column backup
-- (whole-table UPDATE is intentional here)
UPDATE meat_poultry_egg_inspect
SET st = st_copy;

-- Option 2: restore by joining to the backup table
UPDATE meat_poultry_egg_inspect original
SET st = backup.st
FROM meat_poultry_egg_inspect_backup backup
WHERE original.est_number =
backup.est_number;

-- Listing 9-13: Creating and filling the company_standard column
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN company_standard varchar(100);

-- Seed the standardized column from the raw names (whole-table UPDATE is intentional)
UPDATE meat_poultry_egg_inspect
SET company_standard = company;

-- Listing 9-14: Use UPDATE to standardize every name matching a pattern
UPDATE meat_poultry_egg_inspect
SET company_standard = 'Armour-Eckrich Meats'
WHERE company LIKE 'Armour%';

-- Verify the standardization
SELECT company, company_standard
FROM meat_poultry_egg_inspect
WHERE company LIKE 'Armour%';

-- Listing 9-15: Creating and filling the zip_copy column
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN zip_copy varchar(5);

UPDATE meat_poultry_egg_inspect
SET zip_copy = zip;

-- Listing 9-16: Repair zips that lost two leading zeros on import
UPDATE meat_poultry_egg_inspect
SET zip = '00' || zip
WHERE st IN('PR','VI') AND length(zip) = 3;

-- Listing 9-17: Repair zips that lost one leading zero on import
UPDATE meat_poultry_egg_inspect
SET zip = '0' || zip
WHERE st IN('CT','MA','ME','NH','NJ','RI','VT') AND length(zip) = 4;

-- Listing 9-18: Creating and filling a state_regions lookup table
CREATE TABLE state_regions (
    st varchar(2) CONSTRAINT st_key PRIMARY KEY,
    region varchar(20) NOT NULL
);

COPY state_regions
FROM 'C:\YourDirectory\state_regions.csv'
WITH (FORMAT CSV, HEADER, DELIMITER ',');

-- Listing 9-19: Adding and filling an inspection_date column,
-- using a correlated subquery to limit the update to New England
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN inspection_date date;

UPDATE meat_poultry_egg_inspect inspect
SET inspection_date = '2019-12-01'
WHERE EXISTS (SELECT state_regions.region
              FROM state_regions
              WHERE inspect.st = state_regions.st
                AND state_regions.region = 'New England');

-- Listing 9-20: Viewing the updated inspection_date values
SELECT st, inspection_date
FROM meat_poultry_egg_inspect
GROUP BY st, inspection_date
ORDER BY st;

-- Listing 9-21: Delete rows matching an expression
DELETE FROM
meat_poultry_egg_inspect +WHERE st IN('PR','VI'); + +-- Listing 9-22: Remove a column from a table using DROP + +ALTER TABLE meat_poultry_egg_inspect DROP COLUMN zip_copy; + +-- Listing 9-23: Remove a table from a database using DROP + +DROP TABLE meat_poultry_egg_inspect_backup; + +-- Listing 9-24: Demonstrating a transaction block + +-- Start transaction and perform update +START TRANSACTION; + +UPDATE meat_poultry_egg_inspect +SET company = 'AGRO Merchantss Oakland LLC' +WHERE company = 'AGRO Merchants Oakland, LLC'; + +-- view changes +SELECT company +FROM meat_poultry_egg_inspect +WHERE company LIKE 'AGRO%' +ORDER BY company; + +-- Revert changes +ROLLBACK; + +-- See restored state +SELECT company +FROM meat_poultry_egg_inspect +WHERE company LIKE 'AGRO%' +ORDER BY company; + +-- Alternately, commit changes at the end: +START TRANSACTION; + +UPDATE meat_poultry_egg_inspect +SET company = 'AGRO Merchants Oakland LLC' +WHERE company = 'AGRO Merchants Oakland, LLC'; + +COMMIT; + +-- Listing 9-25: Backing up a table while adding and filling a new column + +CREATE TABLE meat_poultry_egg_inspect_backup AS +SELECT *, + '2018-02-07'::date AS reviewed_date +FROM meat_poultry_egg_inspect; + +-- Listing 9-26: Swapping table names using ALTER TABLE + +ALTER TABLE meat_poultry_egg_inspect RENAME TO meat_poultry_egg_inspect_temp; +ALTER TABLE meat_poultry_egg_inspect_backup RENAME TO meat_poultry_egg_inspect; +ALTER TABLE meat_poultry_egg_inspect_temp RENAME TO meat_poultry_egg_inspect_backup; + + diff --git a/Chapter_11/Chapter_11.sql b/Chapter_11/Chapter_11.sql new file mode 100644 index 0000000..ec41dbf --- /dev/null +++ b/Chapter_11/Chapter_11.sql @@ -0,0 +1,167 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 10 Code Examples +-------------------------------------------------------------- + +-- Listing 10-1: Create Census 2011-2015 ACS 5-Year stats 
-- (cont.) table and import data

CREATE TABLE acs_2011_2015_stats (
    geoid varchar(14) CONSTRAINT geoid_key PRIMARY KEY,
    county varchar(50) NOT NULL,
    st varchar(20) NOT NULL,
    pct_travel_60_min numeric(5,3) NOT NULL,
    pct_bachelors_higher numeric(5,3) NOT NULL,
    pct_masters_higher numeric(5,3) NOT NULL,
    median_hh_income integer,
    -- masters is a subset of bachelors-or-higher, so it can never exceed it
    CHECK (pct_masters_higher <= pct_bachelors_higher)
);

COPY acs_2011_2015_stats
FROM 'C:\YourDirectory\acs_2011_2015_stats.csv'
WITH (FORMAT CSV, HEADER, DELIMITER ',');

SELECT * FROM acs_2011_2015_stats;

-- Listing 10-2: Using corr(Y, X) to measure the relationship between
-- education and income
SELECT corr(median_hh_income, pct_bachelors_higher)
    AS bachelors_income_r
FROM acs_2011_2015_stats;

-- Listing 10-3: Using corr(Y, X) on additional variable pairs
SELECT
    round(
      corr(median_hh_income, pct_bachelors_higher)::numeric, 2
      ) AS bachelors_income_r,
    round(
      corr(pct_travel_60_min, median_hh_income)::numeric, 2
      ) AS income_travel_r,
    round(
      corr(pct_travel_60_min, pct_bachelors_higher)::numeric, 2
      ) AS bachelors_travel_r
FROM acs_2011_2015_stats;

-- Listing 10-4: Regression slope and intercept functions
SELECT
    round(
      regr_slope(median_hh_income, pct_bachelors_higher)::numeric, 2
      ) AS slope,
    round(
      regr_intercept(median_hh_income, pct_bachelors_higher)::numeric, 2
      ) AS y_intercept
FROM acs_2011_2015_stats;

-- Listing 10-5: Calculating the coefficient of determination, or r-squared
SELECT round(
       regr_r2(median_hh_income, pct_bachelors_higher)::numeric, 3
       ) AS r_squared
FROM acs_2011_2015_stats;

-- Bonus: Additional stats functions.
+-- Variance +SELECT var_pop(median_hh_income) +FROM acs_2011_2015_stats; + +-- Standard deviation of the entire population +SELECT stddev_pop(median_hh_income) +FROM acs_2011_2015_stats; + +-- Covariance +SELECT covar_pop(median_hh_income, pct_bachelors_higher) +FROM acs_2011_2015_stats; + +-- Listing 10-6: The rank() and dense_rank() window functions + +CREATE TABLE widget_companies ( + id bigserial, + company varchar(30) NOT NULL, + widget_output integer NOT NULL +); + +INSERT INTO widget_companies (company, widget_output) +VALUES + ('Morse Widgets', 125000), + ('Springfield Widget Masters', 143000), + ('Best Widgets', 196000), + ('Acme Inc.', 133000), + ('District Widget Inc.', 201000), + ('Clarke Amalgamated', 620000), + ('Stavesacre Industries', 244000), + ('Bowers Widget Emporium', 201000); + +SELECT + company, + widget_output, + rank() OVER (ORDER BY widget_output DESC), + dense_rank() OVER (ORDER BY widget_output DESC) +FROM widget_companies; + +-- Listing 10-7: Applying rank() within groups using PARTITION BY + +CREATE TABLE store_sales ( + store varchar(30), + category varchar(30) NOT NULL, + unit_sales bigint NOT NULL, + CONSTRAINT store_category_key PRIMARY KEY (store, category) +); + +INSERT INTO store_sales (store, category, unit_sales) +VALUES + ('Broders', 'Cereal', 1104), + ('Wallace', 'Ice Cream', 1863), + ('Broders', 'Ice Cream', 2517), + ('Cramers', 'Ice Cream', 2112), + ('Broders', 'Beer', 641), + ('Cramers', 'Cereal', 1003), + ('Cramers', 'Beer', 640), + ('Wallace', 'Cereal', 980), + ('Wallace', 'Beer', 988); + +SELECT + category, + store, + unit_sales, + rank() OVER (PARTITION BY category ORDER BY unit_sales DESC) +FROM store_sales; + +-- Listing 10-8: Create and fill a 2015 FBI crime data table + +CREATE TABLE fbi_crime_data_2015 ( + st varchar(20), + city varchar(50), + population integer, + violent_crime integer, + property_crime integer, + burglary integer, + larceny_theft integer, + motor_vehicle_theft integer, + CONSTRAINT st_city_key 
PRIMARY KEY (st, city) +); + +COPY fbi_crime_data_2015 +FROM 'C:\YourDirectory\fbi_crime_data_2015.csv' +WITH (FORMAT CSV, HEADER, DELIMITER ','); + +SELECT * FROM fbi_crime_data_2015 +ORDER BY population DESC; + +-- Listing 10-9: Find property crime rates per thousand in cities with 500,000 +-- or more people + +SELECT + city, + st, + population, + property_crime, + round( + (property_crime::numeric / population) * 1000, 1 + ) AS pc_per_1000 +FROM fbi_crime_data_2015 +WHERE population >= 500000 +ORDER BY (property_crime::numeric / population) DESC; diff --git a/Chapter_12/Chapter_12.sql b/Chapter_12/Chapter_12.sql new file mode 100644 index 0000000..e2de8d1 --- /dev/null +++ b/Chapter_12/Chapter_12.sql @@ -0,0 +1,233 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 11 Code Examples +-------------------------------------------------------------- + +-- Listing 11-1: Extracting components of a timestamp value using date_part() + +SELECT + date_part('year', '2019-12-01 18:37:12 EST'::timestamptz) AS "year", + date_part('month', '2019-12-01 18:37:12 EST'::timestamptz) AS "month", + date_part('day', '2019-12-01 18:37:12 EST'::timestamptz) AS "day", + date_part('hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "hour", + date_part('minute', '2019-12-01 18:37:12 EST'::timestamptz) AS "minute", + date_part('seconds', '2019-12-01 18:37:12 EST'::timestamptz) AS "seconds", + date_part('timezone_hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "tz", + date_part('week', '2019-12-01 18:37:12 EST'::timestamptz) AS "week", + date_part('quarter', '2019-12-01 18:37:12 EST'::timestamptz) AS "quarter", + date_part('epoch', '2019-12-01 18:37:12 EST'::timestamptz) AS "epoch"; + +-- Bonus: Using the SQL-standard extract() for similar datetime parsing: + +SELECT extract('year' from '2019-12-01 18:37:12 EST'::timestamptz) AS "year"; + +-- Listing 11-2: Three functions 
for making datetimes from components + +-- make a date +SELECT make_date(2018, 2, 22); +-- make a time +SELECT make_time(18, 4, 30.3); +-- make a timestamp with time zone +SELECT make_timestamptz(2018, 2, 22, 18, 4, 30.3, 'Europe/Lisbon'); + +-- Bonus: Retrieving the current date and time + +SELECT + current_date, + current_time, + current_timestamp, + localtime, + localtimestamp, + now(); + +-- Listing 11-3: Comparing current_timestamp and clock_timestamp() during row insert + +CREATE TABLE current_time_example ( + time_id bigserial, + current_timestamp_col timestamp with time zone, + clock_timestamp_col timestamp with time zone +); + +INSERT INTO current_time_example (current_timestamp_col, clock_timestamp_col) + (SELECT current_timestamp, + clock_timestamp() + FROM generate_series(1,1000)); + +SELECT * FROM current_time_example; + +-- Time Zones + +-- Listing 11-4: Showing your PostgreSQL server's default time zone + +SHOW timezone; -- Note: You can see all run-time defaults with SHOW ALL; + +-- Listing 11-5: Showing time zone abbreviations and names + +SELECT * FROM pg_timezone_abbrevs; +SELECT * FROM pg_timezone_names; + +-- Filter to find one +SELECT * FROM pg_timezone_names +WHERE name LIKE 'Europe%'; + +-- Listing 11-6: Setting the time zone for a client session + +SET timezone TO 'US/Pacific'; + +CREATE TABLE time_zone_test ( + test_date timestamp with time zone +); +INSERT INTO time_zone_test VALUES ('2020-01-01 4:00'); + +SELECT test_date +FROM time_zone_test; + +SET timezone TO 'US/Eastern'; + +SELECT test_date +FROM time_zone_test; + +SELECT test_date AT TIME ZONE 'Asia/Seoul' +FROM time_zone_test; + + +-- Math with dates! 
+ +SELECT '9/30/1929'::date - '9/27/1929'::date; +SELECT '9/30/1929'::date + '5 years'::interval; + + +-- Taxi Rides + +-- Listing 11-7: Creating a table and importing NYC yellow taxi data + +CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 ( + trip_id bigserial PRIMARY KEY, + vendor_id varchar(1) NOT NULL, + tpep_pickup_datetime timestamp with time zone NOT NULL, + tpep_dropoff_datetime timestamp with time zone NOT NULL, + passenger_count integer NOT NULL, + trip_distance numeric(8,2) NOT NULL, + pickup_longitude numeric(18,15) NOT NULL, + pickup_latitude numeric(18,15) NOT NULL, + rate_code_id varchar(2) NOT NULL, + store_and_fwd_flag varchar(1) NOT NULL, + dropoff_longitude numeric(18,15) NOT NULL, + dropoff_latitude numeric(18,15) NOT NULL, + payment_type varchar(1) NOT NULL, + fare_amount numeric(9,2) NOT NULL, + extra numeric(9,2) NOT NULL, + mta_tax numeric(5,2) NOT NULL, + tip_amount numeric(9,2) NOT NULL, + tolls_amount numeric(9,2) NOT NULL, + improvement_surcharge numeric(9,2) NOT NULL, + total_amount numeric(9,2) NOT NULL +); + +COPY nyc_yellow_taxi_trips_2016_06_01 ( + vendor_id, + tpep_pickup_datetime, + tpep_dropoff_datetime, + passenger_count, + trip_distance, + pickup_longitude, + pickup_latitude, + rate_code_id, + store_and_fwd_flag, + dropoff_longitude, + dropoff_latitude, + payment_type, + fare_amount, + extra, + mta_tax, + tip_amount, + tolls_amount, + improvement_surcharge, + total_amount + ) +FROM 'C:\YourDirectory\yellow_tripdata_2016_06_01.csv' +WITH (FORMAT CSV, HEADER, DELIMITER ','); + +CREATE INDEX tpep_pickup_idx +ON nyc_yellow_taxi_trips_2016_06_01 (tpep_pickup_datetime); + +SELECT count(*) FROM nyc_yellow_taxi_trips_2016_06_01; + +-- Listing 11-8: Counting taxi trips by hour + +SELECT + date_part('hour', tpep_pickup_datetime) AS trip_hour, + count(*) +FROM nyc_yellow_taxi_trips_2016_06_01 +GROUP BY trip_hour +ORDER BY trip_hour; + +-- Listing 11-9: Exporting taxi pickups per hour to a CSV file + +COPY + (SELECT + date_part('hour', 
tpep_pickup_datetime) AS trip_hour, + count(*) + FROM nyc_yellow_taxi_trips_2016_06_01 + GROUP BY trip_hour + ORDER BY trip_hour + ) +TO 'C:\YourDirectory\hourly_pickups_2016_06_01.csv' +WITH (FORMAT CSV, HEADER, DELIMITER ','); + +-- Listing 11-10: Calculating median trip time by hour + +SELECT + date_part('hour', tpep_pickup_datetime) AS trip_hour, + percentile_cont(.5) + WITHIN GROUP (ORDER BY + tpep_dropoff_datetime - tpep_pickup_datetime) AS median_trip +FROM nyc_yellow_taxi_trips_2016_06_01 +GROUP BY trip_hour +ORDER BY trip_hour; + +-- Listing 11-11: Creating a table to hold train trip data + +SET timezone TO 'US/Central'; + +CREATE TABLE train_rides ( + trip_id bigserial PRIMARY KEY, + segment varchar(50) NOT NULL, + departure timestamp with time zone NOT NULL, + arrival timestamp with time zone NOT NULL +); + +INSERT INTO train_rides (segment, departure, arrival) +VALUES + ('Chicago to New York', '2017-11-13 21:30 CST', '2017-11-14 18:23 EST'), + ('New York to New Orleans', '2017-11-15 14:15 EST', '2017-11-16 19:32 CST'), + ('New Orleans to Los Angeles', '2017-11-17 13:45 CST', '2017-11-18 9:00 PST'), + ('Los Angeles to San Francisco', '2017-11-19 10:10 PST', '2017-11-19 21:24 PST'), + ('San Francisco to Denver', '2017-11-20 9:10 PST', '2017-11-21 18:38 MST'), + ('Denver to Chicago', '2017-11-22 19:10 MST', '2017-11-23 14:50 CST'); + +SELECT * FROM train_rides; + +-- Listing 11-12: Calculating the length of each trip segment + +SELECT segment, + to_char(departure, 'YYYY-MM-DD HH12:MI a.m. 
TZ') AS departure, + arrival - departure AS segment_time +FROM train_rides; + +-- Listing 11-13: Calculating cumulative intervals using OVER + +SELECT segment, + arrival - departure AS segment_time, + sum(arrival - departure) OVER (ORDER BY trip_id) AS cume_time +FROM train_rides; + +-- Listing 11-14: Better formatting for cumulative trip time + +SELECT segment, + arrival - departure AS segment_time, + sum(date_part('epoch', (arrival - departure))) + OVER (ORDER BY trip_id) * interval '1 second' AS cume_time +FROM train_rides; diff --git a/Chapter_13/Chapter_13.sql b/Chapter_13/Chapter_13.sql new file mode 100644 index 0000000..3738b39 --- /dev/null +++ b/Chapter_13/Chapter_13.sql @@ -0,0 +1,310 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 12 Code Examples +-------------------------------------------------------------- + +-- Listing 12-1: Using a subquery in a WHERE clause + +SELECT geo_name, + state_us_abbreviation, + p0010001 +FROM us_counties_2010 +WHERE p0010001 >= ( + SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010 + ) +ORDER BY p0010001 DESC; + +-- Listing 12-2: Using a subquery in a WHERE clause for DELETE + +CREATE TABLE us_counties_2010_top10 AS +SELECT * FROM us_counties_2010; + +DELETE FROM us_counties_2010_top10 +WHERE p0010001 < ( + SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010_top10 + ); + +SELECT count(*) FROM us_counties_2010_top10; + +-- Listing 12-3: Subquery as a derived table in a FROM clause + +SELECT round(calcs.average, 0) as average, + calcs.median, + round(calcs.average - calcs.median, 0) AS median_average_diff +FROM ( + SELECT avg(p0010001) AS average, + percentile_cont(.5) + WITHIN GROUP (ORDER BY p0010001)::numeric(10,1) AS median + FROM us_counties_2010 + ) +AS calcs; + +-- Listing 12-4: Joining two derived tables + +SELECT 
census.state_us_abbreviation AS st, + census.st_population, + plants.plant_count, + round((plants.plant_count/census.st_population::numeric(10,1)) * 1000000, 1) + AS plants_per_million +FROM + ( + SELECT st, + count(*) AS plant_count + FROM meat_poultry_egg_inspect + GROUP BY st + ) + AS plants +JOIN + ( + SELECT state_us_abbreviation, + sum(p0010001) AS st_population + FROM us_counties_2010 + GROUP BY state_us_abbreviation + ) + AS census +ON plants.st = census.state_us_abbreviation +ORDER BY plants_per_million DESC; + +-- Listing 12-5: Adding a subquery to a column list + +SELECT geo_name, + state_us_abbreviation AS st, + p0010001 AS total_pop, + (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010) AS us_median +FROM us_counties_2010; + +-- Listing 12-6: Using a subquery expression in a calculation + +SELECT geo_name, + state_us_abbreviation AS st, + p0010001 AS total_pop, + (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010) AS us_median, + p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010) AS diff_from_median +FROM us_counties_2010 +WHERE (p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001) + FROM us_counties_2010)) + BETWEEN -1000 AND 1000; + + +-- BONUS: Subquery expressions +-- If you'd like to try the IN, EXISTS, and NOT EXISTS expressions on pages 199-200, +-- here's the code to create a retirees table. The queries below are similar +-- to the hypothetical examples on pages 199 and 200. You will need the +-- employees table you created in Chapter 6. 
+ +-- Create table and insert data +CREATE TABLE retirees ( + id int, + first_name varchar(50), + last_name varchar(50) +); + +INSERT INTO retirees +VALUES (2, 'Lee', 'Smith'), + (4, 'Janet', 'King'); + +-- Generating values for the IN operator +SELECT first_name, last_name +FROM employees +WHERE emp_id IN ( + SELECT id + FROM retirees); + +-- Checking whether values exist (returns all rows from employees +-- if the expression evaluates as true) +SELECT first_name, last_name +FROM employees +WHERE EXISTS ( + SELECT id + FROM retirees); + +-- Using a correlated subquery to find matching values from employees +-- in retirees. +SELECT first_name, last_name +FROM employees +WHERE EXISTS ( + SELECT id + FROM retirees + WHERE id = employees.emp_id); + + + +-- Listing 12-7: Using a simple CTE to find large counties + +WITH + large_counties (geo_name, st, p0010001) +AS + ( + SELECT geo_name, state_us_abbreviation, p0010001 + FROM us_counties_2010 + WHERE p0010001 >= 100000 + ) +SELECT st, count(*) +FROM large_counties +GROUP BY st +ORDER BY count(*) DESC; + +-- Bonus: You can also write this query as: +SELECT state_us_abbreviation, count(*) +FROM us_counties_2010 +WHERE p0010001 >= 100000 +GROUP BY state_us_abbreviation +ORDER BY count(*) DESC; + +-- Listing 12-8: Using CTEs in a table join + +WITH + counties (st, population) AS + (SELECT state_us_abbreviation, sum(p0010001) + FROM us_counties_2010 + GROUP BY state_us_abbreviation), + + plants (st, plants) AS + (SELECT st, count(*) AS plants + FROM meat_poultry_egg_inspect + GROUP BY st) + +SELECT counties.st, + population, + plants, + round((plants/population::numeric(10,1))*1000000, 1) AS per_million +FROM counties JOIN plants +ON counties.st = plants.st +ORDER BY per_million DESC; + +-- Listing 12-9: Using CTEs to minimize redundant code + +WITH us_median AS + (SELECT percentile_cont(.5) + WITHIN GROUP (ORDER BY p0010001) AS us_median_pop + FROM us_counties_2010) + +SELECT geo_name, + 
state_us_abbreviation AS st, + p0010001 AS total_pop, + us_median_pop, + p0010001 - us_median_pop AS diff_from_median +FROM us_counties_2010 CROSS JOIN us_median +WHERE (p0010001 - us_median_pop) + BETWEEN -1000 AND 1000; + + +-- Cross tabulations +-- Install the crosstab() function via the tablefunc module + +CREATE EXTENSION tablefunc; + +-- Listing 12-10: Creating and filling the ice_cream_survey table + +CREATE TABLE ice_cream_survey ( + response_id integer PRIMARY KEY, + office varchar(20), + flavor varchar(20) +); + +COPY ice_cream_survey +FROM 'C:\YourDirectory\ice_cream_survey.csv' +WITH (FORMAT CSV, HEADER); + +-- Listing 12-11: Generating the ice cream survey crosstab + +SELECT * +FROM crosstab('SELECT office, + flavor, + count(*) + FROM ice_cream_survey + GROUP BY office, flavor + ORDER BY office', + + 'SELECT flavor + FROM ice_cream_survey + GROUP BY flavor + ORDER BY flavor') + +AS (office varchar(20), + chocolate bigint, + strawberry bigint, + vanilla bigint); + +-- Listing 12-12: Creating and filling a temperature_readings table + +CREATE TABLE temperature_readings ( + reading_id bigserial PRIMARY KEY, + station_name varchar(50), + observation_date date, + max_temp integer, + min_temp integer +); + +COPY temperature_readings + (station_name, observation_date, max_temp, min_temp) +FROM 'C:\YourDirectory\temperature_readings.csv' +WITH (FORMAT CSV, HEADER); + +-- Listing 12-13: Generating the temperature readings crosstab + +SELECT * +FROM crosstab('SELECT + station_name, + date_part(''month'', observation_date), + percentile_cont(.5) + WITHIN GROUP (ORDER BY max_temp) + FROM temperature_readings + GROUP BY station_name, + date_part(''month'', observation_date) + ORDER BY station_name', + + 'SELECT month + FROM generate_series(1,12) month') + +AS (station varchar(50), + jan numeric(3,0), + feb numeric(3,0), + mar numeric(3,0), + apr numeric(3,0), + may numeric(3,0), + jun numeric(3,0), + jul numeric(3,0), + aug numeric(3,0), + sep numeric(3,0), + oct 
numeric(3,0), + nov numeric(3,0), + dec numeric(3,0) +); + +-- Listing 12-14: Re-classifying temperature data with CASE + +SELECT max_temp, + CASE WHEN max_temp >= 90 THEN 'Hot' + WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' + WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' + WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' + WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' + ELSE 'Inhumane' + END AS temperature_group +FROM temperature_readings; + +-- Listing 12-15: Using CASE in a Common Table Expression + +WITH temps_collapsed (station_name, max_temperature_group) AS + (SELECT station_name, + CASE WHEN max_temp >= 90 THEN 'Hot' + WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm' + WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant' + WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold' + WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing' + ELSE 'Inhumane' + END + FROM temperature_readings) + +SELECT station_name, max_temperature_group, count(*) +FROM temps_collapsed +GROUP BY station_name, max_temperature_group +ORDER BY station_name, count(*) DESC; diff --git a/Chapter_14/Chapter_14.sql b/Chapter_14/Chapter_14.sql new file mode 100644 index 0000000..83dc578 --- /dev/null +++ b/Chapter_14/Chapter_14.sql @@ -0,0 +1,363 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 13 Code Examples +-------------------------------------------------------------- + +-- Commonly used string functions +-- Full list at https://www.postgresql.org/docs/current/static/functions-string.html + +-- Case formatting +SELECT upper('Neal7'); +SELECT lower('Randy'); +SELECT initcap('at the end of the day'); +-- Note initcap's imperfect for acronyms +SELECT initcap('Practical SQL'); + +-- Character Information +SELECT char_length(' Pat '); +SELECT length(' Pat '); +SELECT position(', ' in 'Tan, Bella'); + +-- Removing characters +SELECT trim('s' from 'socks'); +SELECT trim(trailing 's' from 'socks'); 
+SELECT trim(' Pat '); +SELECT char_length(trim(' Pat ')); -- note the length change +SELECT ltrim('socks', 's'); +SELECT rtrim('socks', 's'); + +-- Extracting and replacing characters +SELECT left('703-555-1212', 3); +SELECT right('703-555-1212', 8); +SELECT replace('bat', 'b', 'c'); + + +-- Table 13-2: Regular Expression Matching Examples + +-- Any character one or more times +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '.+'); +-- One or two digits followed by a space and p.m. +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{1,2} (?:a.m.|p.m.)'); +-- One or more word characters at the start +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '^\w+'); +-- One or more word characters followed by any character at the end. +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\w+.$'); +-- The words May or June +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from 'May|June'); +-- Four digits +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{4}'); +-- May followed by a space, digit, comma, space, and four digits. +SELECT substring('The game starts at 7 p.m. on May 2, 2019.' 
from 'May \d, \d{4}'); + + +-- Turning Text to Data with Regular Expression Functions + +-- Listing 13-2: Creating and loading the crime_reports table +-- Data from https://sheriff.loudoun.gov/dailycrime + +CREATE TABLE crime_reports ( + crime_id bigserial PRIMARY KEY, + date_1 timestamp with time zone, + date_2 timestamp with time zone, + street varchar(250), + city varchar(100), + crime_type varchar(100), + description text, + case_number varchar(50), + original_text text NOT NULL +); + +COPY crime_reports (original_text) +FROM 'C:\YourDirectory\crime_reports.csv' +WITH (FORMAT CSV, HEADER OFF, QUOTE '"'); + +SELECT original_text FROM crime_reports; + +-- Listing 13-3: Using regexp_match() to find the first date +SELECT crime_id, + regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') +FROM crime_reports; + +-- Listing 13-4: Using the regexp_matches() function with the 'g' flag +SELECT crime_id, + regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g') +FROM crime_reports; + +-- Listing 13-5: Using regexp_match() to find the second date +-- Note that the result includes an unwanted hyphen +SELECT crime_id, + regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{1,2}') +FROM crime_reports; + +-- Listing 13-6: Using a capture group to return only the date +-- Eliminates the hyphen +SELECT crime_id, + regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})') +FROM crime_reports; + +-- Listing 13-7: Matching case number, date, crime type, and city + +SELECT + regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number, + regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1, + regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, + regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') + AS city +FROM crime_reports; + +-- Bonus: Get all parsed elements at once + +SELECT crime_id, + regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1, + CASE WHEN EXISTS (SELECT regexp_matches(original_text, 
'-(\d{1,2}\/\d{1,2}\/\d{1,2})')) + THEN regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})') + ELSE NULL + END AS date_2, + regexp_match(original_text, '\/\d{2}\n(\d{4})') AS hour_1, + CASE WHEN EXISTS (SELECT regexp_matches(original_text, '\/\d{2}\n\d{4}-(\d{4})')) + THEN regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') + ELSE NULL + END AS hour_2, + regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))') AS street, + regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') AS city, + regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, + regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description, + regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number +FROM crime_reports; + +-- Listing 13-8: Retrieving a value from within an array + +SELECT + crime_id, + (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1] + AS case_number +FROM crime_reports; + +-- Listing 13-9: Updating the crime_reports date_1 column + +UPDATE crime_reports +SET date_1 = +( + (regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] + || ' ' || + (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] + ||' US/Eastern' +)::timestamptz; + +SELECT crime_id, + date_1, + original_text +FROM crime_reports; + +-- Listing 13-10: Updating all crime_reports columns + +UPDATE crime_reports +SET date_1 = + ( + (regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] + || ' ' || + (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] + ||' US/Eastern' + )::timestamptz, + + date_2 = + CASE + -- if there is no second date but there is a second hour + WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})') IS NULL) + AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL) + THEN + ((regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] + || ' ' || + (regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1] + ||' US/Eastern' + )::timestamptz + + -- if there is both a second date and second 
hour + WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})') IS NOT NULL) + AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL) + THEN + ((regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})'))[1] + || ' ' || + (regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1] + ||' US/Eastern' + )::timestamptz + -- if neither of those conditions exist, provide a NULL + ELSE NULL + END, + street = (regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))'))[1], + city = (regexp_match(original_text, + '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n'))[1], + crime_type = (regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):'))[1], + description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1], + case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]; + +-- Listing 13-11: Viewing selected crime data + +SELECT date_1, + street, + city, + crime_type +FROM crime_reports; + +-- Listing 13-12: Using regular expressions in a WHERE clause + +SELECT geo_name +FROM us_counties_2010 +WHERE geo_name ~* '(.+lade.+|.+lare.+)' +ORDER BY geo_name; + +SELECT geo_name +FROM us_counties_2010 +WHERE geo_name ~* '.+ash.+' AND geo_name !~ 'Wash.+' +ORDER BY geo_name; + + +-- Listing 13-13: Regular expression functions to replace and split + +SELECT regexp_replace('05/12/2018', '\d{4}', '2017'); + +SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ','); + +SELECT regexp_split_to_array('Phil Mike Tony Steve', ' '); + +-- Listing 13-14: Finding an array length + +SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1); + + +-- FULL TEXT SEARCH + +-- Full-text search operators: +-- & (AND) +-- | (OR) +-- ! 
(NOT) + +-- Listing 13-15: Converting text to tsvector data + +SELECT to_tsvector('I am walking across the sitting room to sit with you.'); + +-- Listing 13-16: Converting search terms to tsquery data + +SELECT to_tsquery('walking & sitting'); + +-- Listing 13-17: Querying a tsvector type with a tsquery + +SELECT to_tsvector('I am walking across the sitting room') @@ to_tsquery('walking & sitting'); + +SELECT to_tsvector('I am walking across the sitting room') @@ to_tsquery('walking & running'); + +-- Listing 13-18: Creating and filling the president_speeches table + +-- Sources: +-- https://archive.org/details/State-of-the-Union-Addresses-1945-2006 +-- http://www.presidency.ucsb.edu/ws/index.php +-- https://www.eisenhower.archives.gov/all_about_ike/speeches.html + +CREATE TABLE president_speeches ( + sotu_id serial PRIMARY KEY, + president varchar(100) NOT NULL, + title varchar(250) NOT NULL, + speech_date date NOT NULL, + speech_text text NOT NULL, + search_speech_text tsvector +); + +COPY president_speeches (president, title, speech_date, speech_text) +FROM 'C:\YourDirectory\sotu-1946-1977.csv' +WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@'); + +SELECT * FROM president_speeches; + +-- Listing 13-19: Converting speeches to tsvector in the search_speech_text column + +UPDATE president_speeches +SET search_speech_text = to_tsvector('english', speech_text); + +-- Listing 13-20: Creating a GIN index for text search + +CREATE INDEX search_idx ON president_speeches USING gin(search_speech_text); + +-- Listing 13-21: Finding speeches containing the word "Vietnam" + +SELECT president, speech_date +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('Vietnam') +ORDER BY speech_date; + +-- Listing 13-22: Displaying search results with ts_headline() + +SELECT president, + speech_date, + ts_headline(speech_text, to_tsquery('Vietnam'), + 'StartSel = <, + StopSel = >, + MinWords=5, + MaxWords=7, + MaxFragments=1') +FROM president_speeches +WHERE 
search_speech_text @@ to_tsquery('Vietnam'); + +-- Listing 13-23: Finding speeches with the word "transportation" but not "roads" + +SELECT president, + speech_date, + ts_headline(speech_text, to_tsquery('transportation & !roads'), + 'StartSel = <, + StopSel = >, + MinWords=5, + MaxWords=7, + MaxFragments=1') +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('transportation & !roads'); + +-- Listing 13-24: Find speeches where "defense" follows "military" + +SELECT president, + speech_date, + ts_headline(speech_text, to_tsquery('military <-> defense'), + 'StartSel = <, + StopSel = >, + MinWords=5, + MaxWords=7, + MaxFragments=1') +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('military <-> defense'); + +-- Bonus: Example with a distance of 2: +SELECT president, + speech_date, + ts_headline(speech_text, to_tsquery('military <2> defense'), + 'StartSel = <, + StopSel = >, + MinWords=5, + MaxWords=7, + MaxFragments=2') +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('military <2> defense'); + +-- Listing 13-25: Scoring relevance with ts_rank() + +SELECT president, + speech_date, + ts_rank(search_speech_text, + to_tsquery('war & security & threat & enemy')) AS score +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('war & security & threat & enemy') +ORDER BY score DESC +LIMIT 5; + +-- Listing 13-26: Normalizing ts_rank() by speech length + +SELECT president, + speech_date, + ts_rank(search_speech_text, + to_tsquery('war & security & threat & enemy'), 2)::numeric + AS score +FROM president_speeches +WHERE search_speech_text @@ to_tsquery('war & security & threat & enemy') +ORDER BY score DESC +LIMIT 5; + + diff --git a/Chapter_15/Chapter_15.sql b/Chapter_15/Chapter_15.sql new file mode 100644 index 0000000..8e07dda --- /dev/null +++ b/Chapter_15/Chapter_15.sql @@ -0,0 +1,252 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- 
by Anthony DeBarros + +-- Chapter 14 Code Examples +-------------------------------------------------------------- + +-- Listing 14-1: Creating a gis_analysis database + +CREATE DATABASE gis_analysis; +-- Note: Switch to this new database before continuing the examples + +-- Listing 14-2: Loading the PostGIS extension + +CREATE EXTENSION postgis; + +SELECT postgis_full_version(); -- shows PostGIS version + +-- Listing 14-3: Retrieving the well-known text for SRID 4326 + +SELECT srtext +FROM spatial_ref_sys +WHERE srid = 4326; + +-- Listing 14-4: Using ST_GeomFromText() to create spatial objects + +SELECT ST_GeomFromText('POINT(-74.9233606 42.699992)', 4326); + +SELECT ST_GeomFromText('LINESTRING(-74.9 42.7, -75.1 42.7)', 4326); + +SELECT ST_GeomFromText('POLYGON((-74.9 42.7, -75.1 42.7, + -75.1 42.6, -74.9 42.7))', 4326); + +SELECT ST_GeomFromText('MULTIPOINT (-74.9 42.7, -75.1 42.7)', 4326); + +SELECT ST_GeomFromText('MULTILINESTRING((-76.27 43.1, -76.06 43.08), + (-76.2 43.3, -76.2 43.4, + -76.4 43.1))', 4326); + +SELECT ST_GeomFromText('MULTIPOLYGON(( + (-74.92 42.7, -75.06 42.71, + -75.07 42.64, -74.92 42.7), + (-75.0 42.66, -75.0 42.64, + -74.98 42.64, -74.98 42.66, + -75.0 42.66)))', 4326); + +-- Listing 14-5: Using ST_GeogFromText() to create spatial objects + +SELECT +ST_GeogFromText('SRID=4326;MULTIPOINT(-74.9 42.7, -75.1 42.7, -74.924 42.6)'); + +-- Listing 14-6: Functions specific to making points + +SELECT ST_PointFromText('POINT(-74.9233606 42.699992)', 4326); + +SELECT ST_MakePoint(-74.9233606, 42.699992); +SELECT ST_SetSRID(ST_MakePoint(-74.9233606, 42.699992), 4326); + +-- Listing 14-7: Functions specific to making LineStrings + +SELECT ST_LineFromText('LINESTRING(-105.90 35.67,-105.91 35.67)', 4326); +SELECT ST_MakeLine(ST_MakePoint(-74.92, 42.69), ST_MakePoint(-74.12, 42.45)); + +-- Listing 14-8: Functions specific to making Polygons + +SELECT ST_PolygonFromText('POLYGON((-74.9 42.7, -75.1 42.7, + -75.1 42.6, -74.9 42.7))', 4326); + +SELECT 
ST_MakePolygon( + ST_GeomFromText('LINESTRING(-74.92 42.7, -75.06 42.71, + -75.07 42.64, -74.92 42.7)', 4326)); + +SELECT ST_MPolyFromText('MULTIPOLYGON(( + (-74.92 42.7, -75.06 42.71, + -75.07 42.64, -74.92 42.7), + (-75.0 42.66, -75.0 42.64, + -74.98 42.64, -74.98 42.66, + -75.0 42.66) + ))', 4326); + + +-- ANALYZING FARMERS MARKETS DATA +-- https://catalog.data.gov/dataset/farmers-markets-geographic-data +-- https://www.ams.usda.gov/local-food-directories/farmersmarkets + + +-- Listing 14-9: Create and load the farmers_markets table + +CREATE TABLE farmers_markets ( + fmid bigint PRIMARY KEY, + market_name varchar(100) NOT NULL, + street varchar(180), + city varchar(60), + county varchar(25), + st varchar(20) NOT NULL, + zip varchar(10), + longitude numeric(10,7), + latitude numeric(10,7), + organic varchar(1) NOT NULL +); + +COPY farmers_markets +FROM 'C:\YourDirectory\farmers_markets.csv' +WITH (FORMAT CSV, HEADER); + +SELECT count(*) FROM farmers_markets; -- should return 8,681 rows + +-- Listing 14-10: Creating and indexing a geography column +-- There's also a function: https://postgis.net/docs/AddGeometryColumn.html + +-- Add column +ALTER TABLE farmers_markets ADD COLUMN geog_point geography(POINT,4326); + +-- Now fill that column with the lat/long +UPDATE farmers_markets +SET geog_point = ST_SetSRID( + ST_MakePoint(longitude,latitude),4326 + )::geography; + +-- Add a GiST index +CREATE INDEX market_pts_idx ON farmers_markets USING GIST (geog_point); + +-- View the geography column +SELECT longitude, + latitude, + geog_point, + ST_AsText(geog_point) +FROM farmers_markets +WHERE longitude IS NOT NULL +LIMIT 5; + +-- Listing 14-11: Using ST_DWithin() to locate farmers' markets within 10 kilometers of a point + +SELECT market_name, + city, + st +FROM farmers_markets +WHERE ST_DWithin(geog_point, + ST_GeogFromText('POINT(-93.6204386 41.5853202)'), + 10000) +ORDER BY market_name; + +-- Listing 14-12: Using ST_Distance() to calculate the miles between Yankee 
Stadium +-- and Citi Field (Mets) +-- 1609.344 meters/mile + +SELECT ST_Distance( + ST_GeogFromText('POINT(-73.9283685 40.8296466)'), + ST_GeogFromText('POINT(-73.8480153 40.7570917)') + ) / 1609.344 AS mets_to_yanks; + +-- Listing 14-13: Using ST_Distance() for each row in farmers_markets + +SELECT market_name, + city, + round( + (ST_Distance(geog_point, + ST_GeogFromText('POINT(-93.6204386 41.5853202)') + ) / 1609.344)::numeric(8,5), 2 + ) AS miles_from_dt +FROM farmers_markets +WHERE ST_DWithin(geog_point, + ST_GeogFromText('POINT(-93.6204386 41.5853202)'), + 10000) +ORDER BY miles_from_dt ASC; + + +-- WORKING WITH SHAPEFILES + +-- Resources: + -- TIGER/Line® Shapefiles and TIGER/Line® Files + -- https://www.census.gov/geo/maps-data/data/tiger-line.html + -- Cartographic Boundary Shapefiles - Counties + -- https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html + +-- Listing 14-14: Checking the geom column's well-known text representation + +SELECT ST_AsText(geom) +FROM us_counties_2010_shp +LIMIT 1; + +-- Listing 14-15: Find the largest counties by area using ST_Area() + +SELECT name10, + statefp10 AS st, + round( + ( ST_Area(geom::geography) / 2589988.110336 )::numeric, 2 + ) AS square_miles +FROM us_counties_2010_shp +ORDER BY square_miles DESC +LIMIT 5; + +-- Listing 14-16: Using ST_Within() to find the county belonging to a pair of coordinates + +SELECT name10, + statefp10 +FROM us_counties_2010_shp +WHERE ST_Within('SRID=4269;POINT(-118.3419063 34.0977076)'::geometry, geom); + +-- SPATIAL JOINS +-- SANTA FE WATER AND ROAD DATA +-- http://www.santafenm.gov/santa_fe_river + +-- Census 2016 Tiger/Line roads, water +-- https://www.census.gov/geo/maps-data/data/tiger-line.html +-- https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2016&layergroup=Roads +-- https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2016&layergroup=Water + +-- RTTYP - Route Type Code Description +-- https://www.census.gov/geo/reference/rttyp.html +-- C 
County +-- I Interstate +-- M Common Name +-- O Other +-- S State recognized +-- U U.S. + +-- MTFCC MAF/TIGER feature class code +-- https://www.census.gov/geo/reference/mtfcc.html +-- Here, H3010: A natural flowing waterway + +-- Listing 14-17: Using ST_GeometryType() to determine geometry + +SELECT ST_GeometryType(geom) +FROM santafe_linearwater_2016 +LIMIT 1; + +SELECT ST_GeometryType(geom) +FROM santafe_roads_2016 +LIMIT 1; + +-- Listing 14-18: Spatial join with ST_Intersects() to find roads crossing the Santa Fe river + +SELECT water.fullname AS waterway, + roads.rttyp, + roads.fullname AS road +FROM santafe_linearwater_2016 water JOIN santafe_roads_2016 roads + ON ST_Intersects(water.geom, roads.geom) +WHERE water.fullname = 'Santa Fe Riv' +ORDER BY roads.fullname; + +-- Listing 14-19: Using ST_Intersection() to show where roads cross the river + +SELECT water.fullname AS waterway, + roads.rttyp, + roads.fullname AS road, + ST_AsText(ST_Intersection(water.geom, roads.geom)) +FROM santafe_linearwater_2016 water JOIN santafe_roads_2016 roads + ON ST_Intersects(water.geom, roads.geom) +WHERE water.fullname = 'Santa Fe Riv' +ORDER BY roads.fullname +LIMIT 5; diff --git a/Chapter_17/Chapter_17.sql b/Chapter_17/Chapter_17.sql new file mode 100644 index 0000000..44a408a --- /dev/null +++ b/Chapter_17/Chapter_17.sql @@ -0,0 +1,321 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 15 Code Examples +-------------------------------------------------------------- + +-- VIEWS + +-- Listing 15-1: Creating a view that displays Nevada 2010 counties + +CREATE OR REPLACE VIEW nevada_counties_pop_2010 AS + SELECT geo_name, + state_fips, + county_fips, + p0010001 AS pop_2010 + FROM us_counties_2010 + WHERE state_us_abbreviation = 'NV' + ORDER BY county_fips; + +-- Listing 15-2: Querying the nevada_counties_pop_2010 view + +SELECT * +FROM nevada_counties_pop_2010 
+LIMIT 5; + +-- Listing 15-3: Creating a view showing population change for US counties + +CREATE OR REPLACE VIEW county_pop_change_2010_2000 AS + SELECT c2010.geo_name, + c2010.state_us_abbreviation AS st, + c2010.state_fips, + c2010.county_fips, + c2010.p0010001 AS pop_2010, + c2000.p0010001 AS pop_2000, + round( (CAST(c2010.p0010001 AS numeric(8,1)) - c2000.p0010001) + / c2000.p0010001 * 100, 1 ) AS pct_change_2010_2000 + FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000 + ON c2010.state_fips = c2000.state_fips + AND c2010.county_fips = c2000.county_fips + ORDER BY c2010.state_fips, c2010.county_fips; + +-- Listing 15-4: Selecting columns from the county_pop_change_2010_2000 view + +SELECT geo_name, + st, + pop_2010, + pct_change_2010_2000 +FROM county_pop_change_2010_2000 +WHERE st = 'NV' +LIMIT 5; + +-- Listing 15-5: Creating a view on the employees table + +CREATE OR REPLACE VIEW employees_tax_dept AS + SELECT emp_id, + first_name, + last_name, + dept_id + FROM employees + WHERE dept_id = 1 + ORDER BY emp_id + WITH LOCAL CHECK OPTION; + +SELECT * FROM employees_tax_dept; + +-- Listing 15-6: Successful and rejected inserts via the employees_tax_dept view + +INSERT INTO employees_tax_dept (first_name, last_name, dept_id) +VALUES ('Suzanne', 'Legere', 1); + +INSERT INTO employees_tax_dept (first_name, last_name, dept_id) +VALUES ('Jamil', 'White', 2); + +-- optional: +SELECT * FROM employees_tax_dept; + +SELECT * FROM employees; + +-- Listing 15-7: Updating a row via the employees_tax_dept view + +UPDATE employees_tax_dept +SET last_name = 'Le Gere' +WHERE emp_id = 5; + +SELECT * FROM employees_tax_dept; + +-- Bonus: This will fail because the salary column is not in the view +UPDATE employees_tax_dept +SET salary = 100000 +WHERE emp_id = 5; + +-- Listing 15-8: Deleting a row via the employees_tax_dept view + +DELETE FROM employees_tax_dept +WHERE emp_id = 5; + + +-- FUNCTIONS +-- https://www.postgresql.org/docs/current/static/plpgsql.html + +-- 
Listing 15-9: Creating a percent_change function +-- To delete this function: DROP FUNCTION percent_change(numeric,numeric,integer); + +CREATE OR REPLACE FUNCTION +percent_change(new_value numeric, + old_value numeric, + decimal_places integer DEFAULT 1) +RETURNS numeric AS +'SELECT round( + ((new_value - old_value) / old_value) * 100, decimal_places +);' +LANGUAGE SQL +IMMUTABLE +RETURNS NULL ON NULL INPUT; + +-- Listing 15-10: Testing the percent_change() function + +SELECT percent_change(110, 108, 2); + +-- Listing 15-11: Testing percent_change() on Census data + +SELECT c2010.geo_name, + c2010.state_us_abbreviation AS st, + c2010.p0010001 AS pop_2010, + percent_change(c2010.p0010001, c2000.p0010001) AS pct_chg_func, + round( (CAST(c2010.p0010001 AS numeric(8,1)) - c2000.p0010001) + / c2000.p0010001 * 100, 1 ) AS pct_chg_formula +FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000 +ON c2010.state_fips = c2000.state_fips + AND c2010.county_fips = c2000.county_fips +ORDER BY pct_chg_func DESC +LIMIT 5; + +-- Listing 15-12: Adding a column to the teachers table and seeing the data + +ALTER TABLE teachers ADD COLUMN personal_days integer; + +SELECT first_name, + last_name, + hire_date, + personal_days +FROM teachers; + +-- Listing 15-13: Creating an update_personal_days() function + +CREATE OR REPLACE FUNCTION update_personal_days() +RETURNS void AS $$ +BEGIN + UPDATE teachers + SET personal_days = + CASE WHEN (now() - hire_date) BETWEEN '5 years'::interval + AND '10 years'::interval THEN 4 + WHEN (now() - hire_date) > '10 years'::interval THEN 5 + ELSE 3 + END; + RAISE NOTICE 'personal_days updated!'; +END; +$$ LANGUAGE plpgsql; + +-- To run the function: +SELECT update_personal_days(); + +-- Listing 15-14: Enabling the PL/Python procedural language + +CREATE EXTENSION plpythonu; + +-- Listing 15-15: Using PL/Python to create the trim_county() function + +CREATE OR REPLACE FUNCTION trim_county(input_string text) +RETURNS text AS $$ + import re + cleaned = 
re.sub(r' County', '', input_string) + return cleaned +$$ LANGUAGE plpythonu; + +-- Listing 15-16: Testing the trim_county() function + +SELECT geo_name, + trim_county(geo_name) +FROM us_counties_2010 +ORDER BY state_fips, county_fips +LIMIT 5; + + +-- TRIGGERS + +-- Listing 15-17: Creating the grades and grades_history tables + +CREATE TABLE grades ( + student_id bigint, + course_id bigint, + course varchar(30) NOT NULL, + grade varchar(5) NOT NULL, +PRIMARY KEY (student_id, course_id) +); + +INSERT INTO grades +VALUES + (1, 1, 'Biology 2', 'F'), + (1, 2, 'English 11B', 'D'), + (1, 3, 'World History 11B', 'C'), + (1, 4, 'Trig 2', 'B'); + +CREATE TABLE grades_history ( + student_id bigint NOT NULL, + course_id bigint NOT NULL, + change_time timestamp with time zone NOT NULL, + course varchar(30) NOT NULL, + old_grade varchar(5) NOT NULL, + new_grade varchar(5) NOT NULL, +PRIMARY KEY (student_id, course_id, change_time) +); + +-- Listing 15-18: Creating the record_if_grade_changed() function + +CREATE OR REPLACE FUNCTION record_if_grade_changed() + RETURNS trigger AS +$$ +BEGIN + IF NEW.grade <> OLD.grade THEN + INSERT INTO grades_history ( + student_id, + course_id, + change_time, + course, + old_grade, + new_grade) + VALUES + (OLD.student_id, + OLD.course_id, + now(), + OLD.course, + OLD.grade, + NEW.grade); + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Listing 15-19: Creating the grades_update trigger + +CREATE TRIGGER grades_update + AFTER UPDATE + ON grades + FOR EACH ROW + EXECUTE PROCEDURE record_if_grade_changed(); + +-- Listing 15-20: Testing the grades_update trigger + +-- Initially, there should be 0 records in the history +SELECT * FROM grades_history; + +-- Check the grades +SELECT * FROM grades; + +-- Update a grade +UPDATE grades +SET grade = 'C' +WHERE student_id = 1 AND course_id = 1; + +-- Now check the history +SELECT student_id, + change_time, + course, + old_grade, + new_grade +FROM grades_history; + +-- Listing 15-21: Creating a 
temperature_test table + +CREATE TABLE temperature_test ( + station_name varchar(50), + observation_date date, + max_temp integer, + min_temp integer, + max_temp_group varchar(40), +PRIMARY KEY (station_name, observation_date) +); + +-- Listing 15-22: Creating the classify_max_temp() function + +CREATE OR REPLACE FUNCTION classify_max_temp() + RETURNS trigger AS +$$ +BEGIN + CASE + WHEN NEW.max_temp >= 90 THEN + NEW.max_temp_group := 'Hot'; + WHEN NEW.max_temp BETWEEN 70 AND 89 THEN + NEW.max_temp_group := 'Warm'; + WHEN NEW.max_temp BETWEEN 50 AND 69 THEN + NEW.max_temp_group := 'Pleasant'; + WHEN NEW.max_temp BETWEEN 33 AND 49 THEN + NEW.max_temp_group := 'Cold'; + WHEN NEW.max_temp BETWEEN 20 AND 32 THEN + NEW.max_temp_group := 'Freezing'; + ELSE NEW.max_temp_group := 'Inhumane'; + END CASE; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Listing 15-23: Creating the temperature_insert trigger + +CREATE TRIGGER temperature_insert + BEFORE INSERT + ON temperature_test + FOR EACH ROW + EXECUTE PROCEDURE classify_max_temp(); + +-- Listing 15-24: Inserting rows to test the temperature_insert trigger + +INSERT INTO temperature_test (station_name, observation_date, max_temp, min_temp) +VALUES + ('North Station', '1/19/2019', 10, -3), + ('North Station', '3/20/2019', 28, 19), + ('North Station', '5/2/2019', 65, 42), + ('North Station', '8/9/2019', 93, 74); + +SELECT * FROM temperature_test; diff --git a/Chapter_19/Chapter_19.sql b/Chapter_19/Chapter_19.sql new file mode 100644 index 0000000..db7d99e --- /dev/null +++ b/Chapter_19/Chapter_19.sql @@ -0,0 +1,104 @@ +-------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data +-- by Anthony DeBarros + +-- Chapter 17 Code Examples +-------------------------------------------------------------- + +-- VACUUM + +-- Listing 17-1: Creating a table to test vacuuming + +CREATE TABLE vacuum_test ( + integer_column integer +); + +-- Listing 17-2: Determining the size 
of vacuum_test + +SELECT pg_size_pretty( + pg_total_relation_size('vacuum_test') + ); + +-- optional: Determine database size +SELECT pg_size_pretty( + pg_database_size('analysis') + ); + +-- Listing 17-3: Inserting 500,000 rows into vacuum_test + +INSERT INTO vacuum_test +SELECT * FROM generate_series(1,500000); + +-- Test its size again +SELECT pg_size_pretty( + pg_table_size('vacuum_test') + ); + +-- Listing 17-4: Updating all rows in vacuum_test + +UPDATE vacuum_test +SET integer_column = integer_column + 1; + +-- Test its size again (35 MB) +SELECT pg_size_pretty( + pg_table_size('vacuum_test') + ); + +-- Listing 17-5: Viewing autovacuum statistics for vacuum_test + +SELECT relname, + last_vacuum, + last_autovacuum, + vacuum_count, + autovacuum_count +FROM pg_stat_all_tables +WHERE relname = 'vacuum_test'; + +-- To see all columns available +SELECT * +FROM pg_stat_all_tables +WHERE relname = 'vacuum_test'; + +-- Listing 17-6: Running VACUUM manually + +VACUUM vacuum_test; + +VACUUM; -- vacuums the whole database + +VACUUM VERBOSE; -- provides messages + +-- Listing 17-7: Using VACUUM FULL to reclaim disk space + +VACUUM FULL vacuum_test; + +-- Test its size again +SELECT pg_size_pretty( + pg_table_size('vacuum_test') + ); + +-- SETTINGS + +-- Listing 17-8: Showing the location of postgresql.conf + +SHOW config_file; + +-- Listing 17-10: Show the location of the data directory + +SHOW data_directory; + +-- reload settings +-- Mac and Linux: pg_ctl reload -D '/path/to/data/directory/' +-- Windows: pg_ctl reload -D "C:\path\to\data\directory\" + + +-- BACKUP AND RESTORE + +-- Listing 17-11: Backing up the analysis database with pg_dump +pg_dump -d analysis -U [user_name] -Fc > analysis_backup.sql + +-- Back up just a table +pg_dump -t 'train_rides' -d analysis -U [user_name] -Fc > train_backup.sql + +-- Listing 17-12: Restoring the analysis database with pg_restore + +pg_restore -C -d postgres -U postgres analysis_backup.sql