practical-sql-2/Chapter_11/Chapter_11.sql
2021-01-05 17:56:46 -05:00

192 lines
5.3 KiB
SQL

---------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
-- by Anthony DeBarros
-- Chapter 11 Code Examples
----------------------------------------------------------------------------
-- Listing 11-1: Create Census 2014-2018 ACS 5-Year stats table and import data
CREATE TABLE acs_2014_2018_stats (
geoid text CONSTRAINT geoid_key PRIMARY KEY,
county text NOT NULL,
st text NOT NULL,
pct_travel_60_min numeric(5,2),
pct_bachelors_higher numeric(5,2),
pct_masters_higher numeric(5,2),
median_hh_income integer,
CHECK (pct_masters_higher <= pct_bachelors_higher)
);
COPY acs_2014_2018_stats
FROM 'C:\YourDirectory\acs_2014_2018_stats.csv'
WITH (FORMAT CSV, HEADER);
SELECT * FROM acs_2014_2018_stats;
-- Listing 11-2: Using corr(Y, X) to measure the relationship between
-- education and income
SELECT corr(median_hh_income, pct_bachelors_higher)
AS bachelors_income_r
FROM acs_2014_2018_stats;
-- Listing 11-3: Using corr(Y, X) on additional variables
SELECT
round(
corr(median_hh_income, pct_bachelors_higher)::numeric, 2
) AS bachelors_income_r,
round(
corr(pct_travel_60_min, median_hh_income)::numeric, 2
) AS income_travel_r,
round(
corr(pct_travel_60_min, pct_bachelors_higher)::numeric, 2
) AS bachelors_travel_r
FROM acs_2014_2018_stats;
-- Listing 11-4: Regression slope and intercept functions
SELECT
round(
regr_slope(median_hh_income, pct_bachelors_higher)::numeric, 2
) AS slope,
round(
regr_intercept(median_hh_income, pct_bachelors_higher)::numeric, 2
) AS y_intercept
FROM acs_2014_2018_stats;
-- Listing 11-5: Calculating the coefficient of determination, or r-squared
SELECT round(
regr_r2(median_hh_income, pct_bachelors_higher)::numeric, 3
) AS r_squared
FROM acs_2014_2018_stats;
-- Bonus: Additional stats functions
-- Variance of the entire population
SELECT var_pop(median_hh_income)
FROM acs_2014_2018_stats;
-- Standard deviation of the entire population
SELECT stddev_pop(median_hh_income)
FROM acs_2014_2018_stats;
-- Listing 11-6: The rank() and dense_rank() window functions
CREATE TABLE widget_companies (
id integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
company text NOT NULL,
widget_output integer NOT NULL
);
INSERT INTO widget_companies (company, widget_output)
VALUES
('Dom Widgets', 125000),
('Ariadne Widget Masters', 143000),
('Saito Widget Co.', 201000),
('Mal Inc.', 133000),
('Dream Widget Inc.', 196000),
('Miles Amalgamated', 620000),
('Arthur Industries', 244000),
('Fischer Worldwide', 201000);
SELECT
company,
widget_output,
rank() OVER (ORDER BY widget_output DESC),
dense_rank() OVER (ORDER BY widget_output DESC)
FROM widget_companies;
-- Listing 11-7: Applying rank() within groups using PARTITION BY
CREATE TABLE store_sales (
store text NOT NULL,
category text NOT NULL,
unit_sales bigint NOT NULL,
CONSTRAINT store_category_key PRIMARY KEY (store, category)
);
INSERT INTO store_sales (store, category, unit_sales)
VALUES
('Broders', 'Cereal', 1104),
('Wallace', 'Ice Cream', 1863),
('Broders', 'Ice Cream', 2517),
('Cramers', 'Ice Cream', 2112),
('Broders', 'Beer', 641),
('Cramers', 'Cereal', 1003),
('Cramers', 'Beer', 640),
('Wallace', 'Cereal', 980),
('Wallace', 'Beer', 988);
SELECT
category,
store,
unit_sales,
rank() OVER (PARTITION BY category ORDER BY unit_sales DESC)
FROM store_sales;
-- Listing 11-8: Creating and filling a table for Census county business pattern data
CREATE TABLE cbp_naics_72_establishments (
state_fips text,
county_fips text,
county text NOT NULL,
st text NOT NULL,
naics_2017 text NOT NULL,
naics_2017_label text NOT NULL,
year smallint NOT NULL,
establishments integer NOT NULL,
CONSTRAINT cbp_fips_key PRIMARY KEY (state_fips, county_fips)
);
COPY cbp_naics_72_establishments
FROM 'C:\YourDirectory\cbp_naics_72_establishments.csv'
WITH (FORMAT CSV, HEADER);
SELECT * FROM cbp_naics_72_establishments LIMIT 5;
-- Listing 11-9: Finding business rates per thousand population in counties with 50,000 or more people
SELECT
cbp.county,
cbp.st,
cbp.establishments,
pop.pop_est_2018,
round( (cbp.establishments::numeric / pop.pop_est_2018) * 1000, 1 )
AS estabs_per_1000
FROM cbp_naics_72_establishments cbp LEFT JOIN us_counties_pop_est_2019 pop
ON cbp.state_fips = pop.state_fips
AND cbp.county_fips = pop.county_fips
WHERE pop.pop_est_2018 >= 50000
ORDER BY cbp.establishments::numeric / pop.pop_est_2018 DESC;
-- Listing 11-10: Creating a rolling average for export data
CREATE TABLE us_exports (
year smallint,
month smallint,
citrus_export_value bigint,
soybeans_export_value bigint
);
COPY us_exports
FROM 'C:\YourDirectory\us_exports.csv'
WITH (FORMAT CSV, HEADER);
-- View the monthly citrus data
SELECT year, month, citrus_export_value
FROM us_exports
ORDER BY year, month;
-- Calculate rolling average
SELECT year, month, citrus_export_value,
round(
avg(citrus_export_value)
OVER(ORDER BY year, month
ROWS BETWEEN 11 PRECEDING AND CURRENT ROW), 0)
AS twelve_month_avg
FROM us_exports
ORDER BY year, month;