311 lines
8.5 KiB
SQL
311 lines
8.5 KiB
SQL
--------------------------------------------------------------
|
|
-- Practical SQL: A Beginner's Guide to Storytelling with Data
|
|
-- by Anthony DeBarros
|
|
|
|
-- Chapter 13 Code Examples
|
|
--------------------------------------------------------------
|
|
|
|
-- Listing 13-1: Using a subquery in a WHERE clause
|
|
|
|
SELECT geo_name,
|
|
state_us_abbreviation,
|
|
p0010001
|
|
FROM us_counties_2010
|
|
WHERE p0010001 >= (
|
|
SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010
|
|
)
|
|
ORDER BY p0010001 DESC;
|
|
|
|
-- Listing 13-2: Using a subquery in a WHERE clause for DELETE
|
|
|
|
CREATE TABLE us_counties_2010_top10 AS
|
|
SELECT * FROM us_counties_2010;
|
|
|
|
DELETE FROM us_counties_2010_top10
|
|
WHERE p0010001 < (
|
|
SELECT percentile_cont(.9) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010_top10
|
|
);
|
|
|
|
SELECT count(*) FROM us_counties_2010_top10;
|
|
|
|
-- Listing 13-3: Subquery as a derived table in a FROM clause
|
|
|
|
SELECT round(calcs.average, 0) as average,
|
|
calcs.median,
|
|
round(calcs.average - calcs.median, 0) AS median_average_diff
|
|
FROM (
|
|
SELECT avg(p0010001) AS average,
|
|
percentile_cont(.5)
|
|
WITHIN GROUP (ORDER BY p0010001)::numeric(10,1) AS median
|
|
FROM us_counties_2010
|
|
)
|
|
AS calcs;
|
|
|
|
-- Listing 13-4: Joining two derived tables
|
|
|
|
SELECT census.state_us_abbreviation AS st,
|
|
census.st_population,
|
|
plants.plant_count,
|
|
round((plants.plant_count/census.st_population::numeric(10,1)) * 1000000, 1)
|
|
AS plants_per_million
|
|
FROM
|
|
(
|
|
SELECT st,
|
|
count(*) AS plant_count
|
|
FROM meat_poultry_egg_inspect
|
|
GROUP BY st
|
|
)
|
|
AS plants
|
|
JOIN
|
|
(
|
|
SELECT state_us_abbreviation,
|
|
sum(p0010001) AS st_population
|
|
FROM us_counties_2010
|
|
GROUP BY state_us_abbreviation
|
|
)
|
|
AS census
|
|
ON plants.st = census.state_us_abbreviation
|
|
ORDER BY plants_per_million DESC;
|
|
|
|
-- Listing 13-5: Adding a subquery to a column list
|
|
|
|
SELECT geo_name,
|
|
state_us_abbreviation AS st,
|
|
p0010001 AS total_pop,
|
|
(SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010) AS us_median
|
|
FROM us_counties_2010;
|
|
|
|
-- Listing 13-6: Using a subquery expression in a calculation
|
|
|
|
SELECT geo_name,
|
|
state_us_abbreviation AS st,
|
|
p0010001 AS total_pop,
|
|
(SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010) AS us_median,
|
|
p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010) AS diff_from_median
|
|
FROM us_counties_2010
|
|
WHERE (p0010001 - (SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY p0010001)
|
|
FROM us_counties_2010))
|
|
BETWEEN -1000 AND 1000;
|
|
|
|
|
|
-- BONUS: Subquery expressions
|
|
-- If you'd like to try the IN, EXISTS, and NOT EXISTS expressions on pages 199-200,
|
|
-- here's the code to create a retirees table. The queries below are similar
|
|
-- to the hypothetical examples on pages 199 and 200. You will need the
|
|
-- employees table you created in Chapter 6.
|
|
|
|
-- Create table and insert data
|
|
CREATE TABLE retirees (
|
|
id int,
|
|
first_name varchar(50),
|
|
last_name varchar(50)
|
|
);
|
|
|
|
INSERT INTO retirees
|
|
VALUES (2, 'Lee', 'Smith'),
|
|
(4, 'Janet', 'King');
|
|
|
|
-- Generating values for the IN operator
|
|
SELECT first_name, last_name
|
|
FROM employees
|
|
WHERE emp_id IN (
|
|
SELECT id
|
|
FROM retirees);
|
|
|
|
-- Checking whether values exist (returns all rows from employees
|
|
-- if the expression evaluates as true)
|
|
SELECT first_name, last_name
|
|
FROM employees
|
|
WHERE EXISTS (
|
|
SELECT id
|
|
FROM retirees);
|
|
|
|
-- Using a correlated subquery to find matching values from employees
|
|
-- in retirees.
|
|
SELECT first_name, last_name
|
|
FROM employees
|
|
WHERE EXISTS (
|
|
SELECT id
|
|
FROM retirees
|
|
WHERE id = employees.emp_id);
|
|
|
|
|
|
|
|
-- Listing 13-7: Using a simple CTE to find large counties
|
|
|
|
WITH
|
|
large_counties (geo_name, st, p0010001)
|
|
AS
|
|
(
|
|
SELECT geo_name, state_us_abbreviation, p0010001
|
|
FROM us_counties_2010
|
|
WHERE p0010001 >= 100000
|
|
)
|
|
SELECT st, count(*)
|
|
FROM large_counties
|
|
GROUP BY st
|
|
ORDER BY count(*) DESC;
|
|
|
|
-- Bonus: You can also write this query as:
|
|
SELECT state_us_abbreviation, count(*)
|
|
FROM us_counties_2010
|
|
WHERE p0010001 >= 100000
|
|
GROUP BY state_us_abbreviation
|
|
ORDER BY count(*) DESC;
|
|
|
|
-- Listing 13-8: Using CTEs in a table join
|
|
|
|
WITH
|
|
counties (st, population) AS
|
|
(SELECT state_us_abbreviation, sum(population_count_100_percent)
|
|
FROM us_counties_2010
|
|
GROUP BY state_us_abbreviation),
|
|
|
|
plants (st, plants) AS
|
|
(SELECT st, count(*) AS plants
|
|
FROM meat_poultry_egg_inspect
|
|
GROUP BY st)
|
|
|
|
SELECT counties.st,
|
|
population,
|
|
plants,
|
|
round((plants/population::numeric(10,1))*1000000, 1) AS per_million
|
|
FROM counties JOIN plants
|
|
ON counties.st = plants.st
|
|
ORDER BY per_million DESC;
|
|
|
|
-- Listing 13-9: Using CTEs to minimize redundant code
|
|
|
|
WITH us_median AS
|
|
(SELECT percentile_cont(.5)
|
|
WITHIN GROUP (ORDER BY p0010001) AS us_median_pop
|
|
FROM us_counties_2010)
|
|
|
|
SELECT geo_name,
|
|
state_us_abbreviation AS st,
|
|
p0010001 AS total_pop,
|
|
us_median_pop,
|
|
p0010001 - us_median_pop AS diff_from_median
|
|
FROM us_counties_2010 CROSS JOIN us_median
|
|
WHERE (p0010001 - us_median_pop)
|
|
BETWEEN -1000 AND 1000;
|
|
|
|
|
|
-- Cross tabulations
|
|
-- Install the crosstab() function via the tablefunc module
|
|
|
|
CREATE EXTENSION tablefunc;
|
|
|
|
-- Listing 13-10: Creating and filling the ice_cream_survey table
|
|
|
|
CREATE TABLE ice_cream_survey (
|
|
response_id integer PRIMARY KEY,
|
|
office varchar(20),
|
|
flavor varchar(20)
|
|
);
|
|
|
|
COPY ice_cream_survey
|
|
FROM 'C:\YourDirectory\ice_cream_survey.csv'
|
|
WITH (FORMAT CSV, HEADER);
|
|
|
|
-- Listing 13-11: Generating the ice cream survey crosstab
|
|
|
|
SELECT *
|
|
FROM crosstab('SELECT office,
|
|
flavor,
|
|
count(*)
|
|
FROM ice_cream_survey
|
|
GROUP BY office, flavor
|
|
ORDER BY office',
|
|
|
|
'SELECT flavor
|
|
FROM ice_cream_survey
|
|
GROUP BY flavor
|
|
ORDER BY flavor')
|
|
|
|
AS (office varchar(20),
|
|
chocolate bigint,
|
|
strawberry bigint,
|
|
vanilla bigint);
|
|
|
|
-- Listing 13-12: Creating and filling a temperature_readings table
|
|
|
|
CREATE TABLE temperature_readings (
|
|
reading_id bigserial PRIMARY KEY,
|
|
station_name varchar(50),
|
|
observation_date date,
|
|
max_temp integer,
|
|
min_temp integer
|
|
);
|
|
|
|
COPY temperature_readings
|
|
(station_name, observation_date, max_temp, min_temp)
|
|
FROM 'C:\YourDirectory\temperature_readings.csv'
|
|
WITH (FORMAT CSV, HEADER);
|
|
|
|
-- Listing 13-13: Generating the temperature readings crosstab
|
|
|
|
SELECT *
|
|
FROM crosstab('SELECT
|
|
station_name,
|
|
date_part(''month'', observation_date),
|
|
percentile_cont(.5)
|
|
WITHIN GROUP (ORDER BY max_temp)
|
|
FROM temperature_readings
|
|
GROUP BY station_name,
|
|
date_part(''month'', observation_date)
|
|
ORDER BY station_name',
|
|
|
|
'SELECT month
|
|
FROM generate_series(1,12) month')
|
|
|
|
AS (station varchar(50),
|
|
jan numeric(3,0),
|
|
feb numeric(3,0),
|
|
mar numeric(3,0),
|
|
apr numeric(3,0),
|
|
may numeric(3,0),
|
|
jun numeric(3,0),
|
|
jul numeric(3,0),
|
|
aug numeric(3,0),
|
|
sep numeric(3,0),
|
|
oct numeric(3,0),
|
|
nov numeric(3,0),
|
|
dec numeric(3,0)
|
|
);
|
|
|
|
-- Listing 13-14: Re-classifying temperature data with CASE
|
|
|
|
SELECT max_temp,
|
|
CASE WHEN max_temp >= 90 THEN 'Hot'
|
|
WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm'
|
|
WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant'
|
|
WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold'
|
|
WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing'
|
|
ELSE 'Inhumane'
|
|
END AS temperature_group
|
|
FROM temperature_readings;
|
|
|
|
-- Listing 13-15: Using CASE in a Common Table Expression
|
|
|
|
WITH temps_collapsed (station_name, max_temperature_group) AS
|
|
(SELECT station_name,
|
|
CASE WHEN max_temp >= 90 THEN 'Hot'
|
|
WHEN max_temp BETWEEN 70 AND 89 THEN 'Warm'
|
|
WHEN max_temp BETWEEN 50 AND 69 THEN 'Pleasant'
|
|
WHEN max_temp BETWEEN 33 AND 49 THEN 'Cold'
|
|
WHEN max_temp BETWEEN 20 AND 32 THEN 'Freezing'
|
|
ELSE 'Inhumane'
|
|
END
|
|
FROM temperature_readings)
|
|
|
|
SELECT station_name, max_temperature_group, count(*)
|
|
FROM temps_collapsed
|
|
GROUP BY station_name, max_temperature_group
|
|
ORDER BY station_name, count(*) DESC;
|