Remove first edition code from Try it Yourself file

This commit is contained in:
anthonydb 2020-07-26 15:40:46 -04:00
parent ef1218d773
commit 7e2bcf3cc7


@@ -5,7 +5,11 @@
-- Try It Yourself Questions and Answers
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-- Chapter 1: Setting Up Your Coding Environment
----------------------------------------------------------------------------
-- There are no Try It Yourself exercises in this chapter!
----------------------------------------------------------------------------
-- Chapter 2: Creating Your First Database and Table
@@ -305,827 +309,4 @@ GROUP BY state_name;
--------------------------------------------------------------
-- Chapter 7: Joining Tables in a Relational Database
--------------------------------------------------------------
-- 1. The table us_counties_2010 contains 3,143 rows, and us_counties_2000 has
-- 3,141. That reflects the ongoing adjustments to county-level geographies that
-- typically result from government decision making. Using appropriate joins and
-- the NULL value, identify which counties don't exist in both tables. For fun,
-- search online to find out why they're missing.
-- Answers:
-- Counties that exist in 2010 data but not 2000 include five county equivalents
-- in Alaska (called boroughs) plus Broomfield County, Colorado.
SELECT c2010.geo_name,
       c2010.state_us_abbreviation,
       c2000.geo_name
FROM us_counties_2010 c2010 LEFT JOIN us_counties_2000 c2000
    ON c2010.state_fips = c2000.state_fips
    AND c2010.county_fips = c2000.county_fips
WHERE c2000.geo_name IS NULL;
-- Counties that exist in 2000 data but not 2010 include three county
-- equivalents in Alaska (called boroughs) plus Clifton Forge city, Virginia,
-- which gave up its independent city status in 2001:
SELECT c2010.geo_name,
       c2000.geo_name,
       c2000.state_us_abbreviation
FROM us_counties_2010 c2010 RIGHT JOIN us_counties_2000 c2000
    ON c2010.state_fips = c2000.state_fips
    AND c2010.county_fips = c2000.county_fips
WHERE c2010.geo_name IS NULL;
-- 2. Using either the median() or percentile_cont() functions in Chapter 5,
-- determine the median of the percent change in county population.
-- Answer: 3.2%
-- Using median():
SELECT median(round( (CAST(c2010.p0010001 AS numeric(8,1)) - c2000.p0010001)
                     / c2000.p0010001 * 100, 1 )) AS median_pct_change
FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000
    ON c2010.state_fips = c2000.state_fips
    AND c2010.county_fips = c2000.county_fips;
-- Using percentile_cont():
SELECT percentile_cont(.5)
       WITHIN GROUP (ORDER BY round( (CAST(c2010.p0010001 AS numeric(8,1)) - c2000.p0010001)
                                     / c2000.p0010001 * 100, 1 )) AS percentile_50th
FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000
    ON c2010.state_fips = c2000.state_fips
    AND c2010.county_fips = c2000.county_fips;
-- Note: In both examples, you're finding the median of all the
-- county population percent change values.
-- 3. Which county had the greatest percentage loss of population between 2000
-- and 2010? Do you have any idea why? Hint: a weather event happened in 2005.
-- Answer: St. Bernard Parish, La. It and other Louisiana parishes (the county
-- equivalent name in Louisiana) experienced substantial population loss
-- following Hurricane Katrina in 2005.
SELECT c2010.geo_name,
       c2010.state_us_abbreviation,
       c2010.p0010001 AS pop_2010,
       c2000.p0010001 AS pop_2000,
       c2010.p0010001 - c2000.p0010001 AS raw_change,
       round( (CAST(c2010.p0010001 AS DECIMAL(8,1)) - c2000.p0010001)
              / c2000.p0010001 * 100, 1 ) AS pct_change
FROM us_counties_2010 c2010 INNER JOIN us_counties_2000 c2000
    ON c2010.state_fips = c2000.state_fips
    AND c2010.county_fips = c2000.county_fips
ORDER BY pct_change ASC;
--------------------------------------------------------------
-- Chapter 8: Table Design that Works for You
--------------------------------------------------------------
-- Consider the following two tables from a database you're making to keep
-- track of your vinyl LP collection. Start by reviewing these CREATE TABLE
-- statements.
-- The albums table includes information specific to the overall collection
-- of songs on the disc. The songs table catalogs each track on the album.
-- Each song has a title and its own artist column, because each song might
-- feature its own collection of artists.
CREATE TABLE albums (
    album_id bigserial,
    album_catalog_code varchar(100),
    album_title text,
    album_artist text,
    album_time interval,
    album_release_date date,
    album_genre varchar(40),
    album_description text
);
CREATE TABLE songs (
    song_id bigserial,
    song_title text,
    song_artist text,
    album_id bigint
);
-- Use the tables to answer these questions:
-- 1. Modify these CREATE TABLE statements to include primary and foreign keys
-- plus additional constraints on both tables. Explain why you made your
-- choices.
CREATE TABLE albums (
    album_id bigserial,
    album_catalog_code varchar(100) NOT NULL,
    album_title text NOT NULL,
    album_artist text NOT NULL,
    album_release_date date,
    album_genre varchar(40),
    album_description text,
    CONSTRAINT album_id_key PRIMARY KEY (album_id),
    CONSTRAINT release_date_check CHECK (album_release_date > '1/1/1925')
);
CREATE TABLE songs (
    song_id bigserial,
    song_title text NOT NULL,
    song_artist text NOT NULL,
    album_id bigint REFERENCES albums (album_id),
    CONSTRAINT song_id_key PRIMARY KEY (song_id)
);
-- Answers:
-- a) Both tables get a primary key using surrogate key id values that are
-- auto-generated via serial data types.
-- b) The songs table references albums via a foreign key constraint.
-- c) In both tables, the title and artist columns cannot be empty, which
-- is specified via a NOT NULL constraint. We assume that every album and
-- song should at minimum have that information.
-- d) In albums, the album_release_date column has a CHECK constraint
-- because it would likely be impossible for us to own an LP made before 1925.
-- 2. Instead of using album_id as a surrogate key for your primary key, are
-- there any columns in albums that could be useful as a natural key? What would
-- you have to know to decide?
-- Answer:
-- We could consider the album_catalog_code. We would have to answer yes to
-- these questions:
-- - Is it going to be unique across all albums released by all companies?
-- - Will we always have one?
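-- For illustration, a hypothetical sketch of albums rebuilt around
-- album_catalog_code as a natural primary key, assuming both questions above
-- can be answered yes (the table and constraint names here are invented):
CREATE TABLE albums_natural_key (
    album_catalog_code varchar(100) CONSTRAINT album_catalog_code_key PRIMARY KEY,
    album_title text NOT NULL,
    album_artist text NOT NULL,
    album_release_date date,
    album_genre varchar(40),
    album_description text
);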
-- 3. To speed up queries, which columns are good candidates for indexes?
-- Answer:
-- Primary key columns get indexes by default, but we should add an index
-- to the album_id foreign key column in the songs table because we'll use
-- it in table joins. It's likely that we'll query these tables to search
-- by titles and artists, so those columns in both tables should get indexes
-- too. The album_release_date in albums also is a candidate if we expect
-- to perform many queries that include date ranges.
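-- For illustration, the indexes described above might look like this
-- (the index names are hypothetical):
CREATE INDEX songs_album_id_idx ON songs (album_id);
CREATE INDEX songs_song_title_idx ON songs (song_title);
CREATE INDEX songs_song_artist_idx ON songs (song_artist);
CREATE INDEX albums_album_title_idx ON albums (album_title);
CREATE INDEX albums_album_artist_idx ON albums (album_artist);
CREATE INDEX albums_album_release_date_idx ON albums (album_release_date);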
----------------------------------------------------------------
-- Chapter 9: Extracting Information by Grouping and Summarizing
----------------------------------------------------------------
-- 1. We saw that library visits have declined in most places. But what is the
-- pattern in the use of technology in libraries? Both the 2014 and 2009 library
-- survey tables contain the columns gpterms (the number of internet-connected
-- computers used by the public) and pitusr (uses of public internet computers
-- per year). Modify the code in Listing 8-13 to calculate the percent change in
-- the sum of each column over time. Watch out for negative values!
-- Answer:
-- Use sum() on gpterms (computer terminals) by state, find percent change, and
-- then sort.
SELECT pls14.stabr,
       sum(pls14.gpterms) AS gpterms_2014,
       sum(pls09.gpterms) AS gpterms_2009,
       round( (CAST(sum(pls14.gpterms) AS decimal(10,1)) - sum(pls09.gpterms)) /
              sum(pls09.gpterms) * 100, 2 ) AS pct_change
FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09
    ON pls14.fscskey = pls09.fscskey
WHERE pls14.gpterms >= 0 AND pls09.gpterms >= 0
GROUP BY pls14.stabr
ORDER BY pct_change DESC;
-- The query results show a consistent increase in the number of internet
-- computers used by the public in most states.
-- Use sum() on pitusr (uses of public internet computers per year) by state,
-- add percent change, and sort.
SELECT pls14.stabr,
       sum(pls14.pitusr) AS pitusr_2014,
       sum(pls09.pitusr) AS pitusr_2009,
       round( (CAST(sum(pls14.pitusr) AS decimal(10,1)) - sum(pls09.pitusr)) /
              sum(pls09.pitusr) * 100, 2 ) AS pct_change
FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09
    ON pls14.fscskey = pls09.fscskey
WHERE pls14.pitusr >= 0 AND pls09.pitusr >= 0
GROUP BY pls14.stabr
ORDER BY pct_change DESC;
-- The query results show most states have seen a decrease in the total uses
-- of public internet computers per year.
-- 2. Both library survey tables contain a column called obereg, a two-digit
-- Bureau of Economic Analysis Code that classifies each library agency
-- according to a region of the United States, such as New England, Rocky
-- Mountains, and so on. Just as we calculated the percent change in visits
-- grouped by state, do the same to group percent changes in visits by US
-- regions using obereg. Consult the survey documentation to find the meaning
-- of each region code. For a bonus challenge, create a table with the obereg
-- code as the primary key and the region name as text, and join it to the
-- summary query to group by the region name rather than the code.
-- Answer:
-- a) sum() visits by region.
SELECT pls14.obereg,
       sum(pls14.visits) AS visits_2014,
       sum(pls09.visits) AS visits_2009,
       round( (CAST(sum(pls14.visits) AS decimal(10,1)) - sum(pls09.visits)) /
              sum(pls09.visits) * 100, 2 ) AS pct_change
FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09
    ON pls14.fscskey = pls09.fscskey
WHERE pls14.visits >= 0 AND pls09.visits >= 0
GROUP BY pls14.obereg
ORDER BY pct_change DESC;
-- b) Bonus: creating the regions lookup table and adding it to the query.
CREATE TABLE obereg_codes (
    obereg varchar(2) CONSTRAINT obereg_key PRIMARY KEY,
    region varchar(50)
);
INSERT INTO obereg_codes
VALUES ('01', 'New England (CT ME MA NH RI VT)'),
       ('02', 'Mid East (DE DC MD NJ NY PA)'),
       ('03', 'Great Lakes (IL IN MI OH WI)'),
       ('04', 'Plains (IA KS MN MO NE ND SD)'),
       ('05', 'Southeast (AL AR FL GA KY LA MS NC SC TN VA WV)'),
       ('06', 'Southwest (AZ NM OK TX)'),
       ('07', 'Rocky Mountains (CO ID MT UT WY)'),
       ('08', 'Far West (AK CA HI NV OR WA)'),
       ('09', 'Outlying Areas (AS GU MP PR VI)');
-- sum() visits by region.
SELECT obereg_codes.region,
       sum(pls14.visits) AS visits_2014,
       sum(pls09.visits) AS visits_2009,
       round( (CAST(sum(pls14.visits) AS decimal(10,1)) - sum(pls09.visits)) /
              sum(pls09.visits) * 100, 2 ) AS pct_change
FROM pls_fy2014_pupld14a pls14 JOIN pls_fy2009_pupld09a pls09
    ON pls14.fscskey = pls09.fscskey
JOIN obereg_codes
    ON pls14.obereg = obereg_codes.obereg
WHERE pls14.visits >= 0 AND pls09.visits >= 0
GROUP BY obereg_codes.region
ORDER BY pct_change DESC;
-- 3. Thinking back to the types of joins you learned in Chapter 6, which join
-- type will show you all the rows in both tables, including those without a
-- match? Write such a query and add an IS NULL filter in a WHERE clause to
-- show agencies not included in one or the other table.
-- Answer: a FULL OUTER JOIN will show all rows in both tables.
SELECT pls14.libname, pls14.city, pls14.stabr, pls14.statstru, pls14.c_admin, pls14.branlib,
       pls09.libname, pls09.city, pls09.stabr, pls09.statstru, pls09.c_admin, pls09.branlib
FROM pls_fy2014_pupld14a pls14 FULL OUTER JOIN pls_fy2009_pupld09a pls09
    ON pls14.fscskey = pls09.fscskey
WHERE pls14.fscskey IS NULL OR pls09.fscskey IS NULL;
-- Note: The IS NULL statements in the WHERE clause limit results to those
-- that do not appear in both tables.
--------------------------------------------------------------
-- Chapter 10: Inspecting and Modifying Data
--------------------------------------------------------------
-- In this exercise, you'll turn the meat_poultry_egg_inspect table into useful
-- information. You need to answer two questions: How many of the companies
-- in the table process meat, and how many process poultry?
-- Create two new columns called meat_processing and poultry_processing. Each
-- can be of the type boolean.
-- Using UPDATE, set meat_processing = TRUE on any row where the activities
-- column contains the text 'Meat Processing'. Do the same update on the
-- poultry_processing column, but this time look for the text
-- 'Poultry Processing' in activities.
-- Use the data from the new, updated columns to count how many companies
-- perform each type of activity. For a bonus challenge, count how many
-- companies perform both activities.
-- Answer:
-- a) Add the columns
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN meat_processing boolean;
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN poultry_processing boolean;
SELECT * FROM meat_poultry_egg_inspect; -- view table with new empty columns
-- b) Update the columns
UPDATE meat_poultry_egg_inspect
SET meat_processing = TRUE
WHERE activities ILIKE '%meat processing%'; -- case-insensitive match with wildcards
UPDATE meat_poultry_egg_inspect
SET poultry_processing = TRUE
WHERE activities ILIKE '%poultry processing%'; -- case-insensitive match with wildcards
-- c) view the updated table
SELECT * FROM meat_poultry_egg_inspect;
-- d) Count meat and poultry processors
SELECT count(meat_processing), count(poultry_processing)
FROM meat_poultry_egg_inspect;
-- e) Count those who do both
SELECT count(*)
FROM meat_poultry_egg_inspect
WHERE meat_processing = TRUE AND
      poultry_processing = TRUE;
--------------------------------------------------------------
-- Chapter 11: Statistical Functions in SQL
--------------------------------------------------------------
-- 1. In Listing 10-2, the correlation coefficient, or r value, of the
-- variables pct_bachelors_higher and median_hh_income was about .68.
-- Write a query to show the correlation between pct_masters_higher and
-- median_hh_income. Is the r value higher or lower? What might explain
-- the difference?
-- Answer:
-- The r value of pct_masters_higher and median_hh_income is about .57, which
-- shows a lower connection between percent master's degree or higher and
-- income than percent bachelor's degree or higher and income. One possible
-- explanation is that attaining a master's degree or higher may have a more
-- incremental impact on earnings than attaining a bachelor's degree.
SELECT
    round(
        corr(median_hh_income, pct_bachelors_higher)::numeric, 2
    ) AS bachelors_income_r,
    round(
        corr(median_hh_income, pct_masters_higher)::numeric, 2
    ) AS masters_income_r
FROM acs_2011_2015_stats;
-- 2. In the FBI crime data, which cities with a population of 500,000 or
-- more have the highest rates of motor vehicle thefts (column
-- motor_vehicle_theft)? Which have the highest violent crime rates
-- (column violent_crime)?
-- Answer:
-- a) In 2015, Milwaukee and Albuquerque had the two highest rates of motor
-- vehicle theft:
SELECT
    city,
    st,
    population,
    motor_vehicle_theft,
    round(
        (motor_vehicle_theft::numeric / population) * 100000, 1
    ) AS vehicle_theft_per_100000
FROM fbi_crime_data_2015
WHERE population >= 500000
ORDER BY vehicle_theft_per_100000 DESC;
-- b) In 2015, Detroit and Memphis had the two highest rates of violent crime.
SELECT
    city,
    st,
    population,
    violent_crime,
    round(
        (violent_crime::numeric / population) * 100000, 1
    ) AS violent_crime_per_100000
FROM fbi_crime_data_2015
WHERE population >= 500000
ORDER BY violent_crime_per_100000 DESC;
-- 3. As a bonus challenge, revisit the libraries data in the table
-- pls_fy2014_pupld14a in Chapter 8. Rank library agencies based on the rate
-- of visits per 1,000 population (variable popu_lsa), and limit the query to
-- agencies serving 250,000 people or more.
-- Answer:
-- Cuyahoga County Public Library tops the rankings with 12,963 visits per
-- thousand people (or roughly 13 visits per person).
SELECT
    libname,
    stabr,
    visits,
    popu_lsa,
    round(
        (visits::numeric / popu_lsa) * 1000, 1
    ) AS visits_per_1000,
    rank() OVER (ORDER BY (visits::numeric / popu_lsa) * 1000 DESC)
FROM pls_fy2014_pupld14a
WHERE popu_lsa >= 250000;
--------------------------------------------------------------
-- Chapter 12: Working with Dates and Times
--------------------------------------------------------------
-- 1. Using the New York City taxi data, calculate the length of each ride using
-- the pickup and drop-off timestamps. Sort the query results from the longest
-- ride to the shortest. Do you notice anything about the longest or shortest
-- trips that you might want to ask city officials about?
-- Answer: More than 480 of the trips last more than 10 hours, which seems
-- excessive. Moreover, two records have drop-off times before the pickup time,
-- and several have pickup and drop-off times that are the same. It's worth
-- asking whether these records have timestamp errors.
SELECT
    trip_id,
    tpep_pickup_datetime,
    tpep_dropoff_datetime,
    tpep_dropoff_datetime - tpep_pickup_datetime AS length_of_ride
FROM nyc_yellow_taxi_trips_2016_06_01
ORDER BY length_of_ride DESC;
-- 2. Using the AT TIME ZONE keywords, write a query that displays the date and
-- time for London, Johannesburg, Moscow, and Melbourne the moment January 1,
-- 2100, arrives in New York City.
-- Answer:
SELECT '2100-01-01 00:00:00-05' AT TIME ZONE 'US/Eastern' AS new_york,
       '2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/London' AS london,
       '2100-01-01 00:00:00-05' AT TIME ZONE 'Africa/Johannesburg' AS johannesburg,
       '2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/Moscow' AS moscow,
       '2100-01-01 00:00:00-05' AT TIME ZONE 'Australia/Melbourne' AS melbourne;
-- 3. As a bonus challenge, use the statistics functions in Chapter 10 to
-- calculate the correlation coefficient and r-squared values using trip time
-- and the total_amount column in the New York City taxi data, which represents
-- total amount charged to passengers. Do the same with trip_distance and
-- total_amount. Limit the query to rides that last three hours or less.
-- Answer:
SELECT
    round(
        corr(total_amount, (
            date_part('epoch', tpep_dropoff_datetime) -
            date_part('epoch', tpep_pickup_datetime)
        ))::numeric, 2
    ) AS amount_time_corr,
    round(
        regr_r2(total_amount, (
            date_part('epoch', tpep_dropoff_datetime) -
            date_part('epoch', tpep_pickup_datetime)
        ))::numeric, 2
    ) AS amount_time_r2,
    round(
        corr(total_amount, trip_distance)::numeric, 2
    ) AS amount_distance_corr,
    round(
        regr_r2(total_amount, trip_distance)::numeric, 2
    ) AS amount_distance_r2
FROM nyc_yellow_taxi_trips_2016_06_01
WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
-- Note: Both correlations are strong, with r values of 0.80 or higher. We'd
-- expect this, given that the cost of a taxi ride is based on both time and distance.
--------------------------------------------------------------
-- Chapter 13: Advanced Query Techniques
--------------------------------------------------------------
-- 1. Revise the code in Listing 12-15 to dig deeper into the nuances of
-- Waikiki's high temperatures. Limit the temps_collapsed table to the Waikiki
-- maximum daily temperature observations. Then use the WHEN clauses in the
-- CASE statement to reclassify the temperatures into seven groups that would
-- result in the following text output:
-- '90 or more'
-- '88-89'
-- '86-87'
-- '84-85'
-- '82-83'
-- '80-81'
-- '79 or less'
-- In which of those groups does Waikiki's daily maximum temperature fall most
-- often?
-- Answer: Between 86 and 87 degrees. Nice.
WITH temps_collapsed (station_name, max_temperature_group) AS
    (SELECT station_name,
            CASE WHEN max_temp >= 90 THEN '90 or more'
                 WHEN max_temp BETWEEN 88 AND 89 THEN '88-89'
                 WHEN max_temp BETWEEN 86 AND 87 THEN '86-87'
                 WHEN max_temp BETWEEN 84 AND 85 THEN '84-85'
                 WHEN max_temp BETWEEN 82 AND 83 THEN '82-83'
                 WHEN max_temp BETWEEN 80 AND 81 THEN '80-81'
                 WHEN max_temp <= 79 THEN '79 or less'
            END
     FROM temperature_readings
     WHERE station_name = 'WAIKIKI 717.2 HI US')
SELECT station_name, max_temperature_group, count(*)
FROM temps_collapsed
GROUP BY station_name, max_temperature_group
ORDER BY max_temperature_group;
-- 2. Revise the ice cream survey crosstab in Listing 12-11 to flip the table.
-- In other words, make flavor the rows and office the columns. Which elements
-- of the query do you need to change? Are the counts different?
-- Answer: You need to re-order the columns in the first subquery so flavor is
-- first and office is second. count(*) stays third. Then, you must change
-- the second subquery to produce a grouped list of offices. Finally, you must
-- add the office names to the output list.
-- The numbers don't change, just the order presented in the crosstab.
SELECT *
FROM crosstab('SELECT flavor,
                      office,
                      count(*)
               FROM ice_cream_survey
               GROUP BY flavor, office
               ORDER BY flavor',
              'SELECT office
               FROM ice_cream_survey
               GROUP BY office
               ORDER BY office')
AS (flavor varchar(20),
    downtown bigint,
    midtown bigint,
    uptown bigint);
-------------------------------------------------------------
-- Chapter 14: Mining Text to Find Meaningful Data
--------------------------------------------------------------
-- 1. The style guide of a publishing company you're writing for wants you to
-- avoid commas before suffixes in names. But there are several names like
-- Alvarez, Jr. and Williams, Sr. in your author database. Which functions can
-- you use to remove the comma? Would a regular expression function help?
-- How would you capture just the suffixes to place them into a separate column?
-- Answer: You can use either the standard SQL replace() function or the
-- PostgreSQL regexp_replace() function:
SELECT replace('Williams, Sr.', ', ', ' ');
SELECT regexp_replace('Williams, Sr.', ', ', ' ');
-- Answer: To capture just the suffixes, search for characters after a comma
-- and space and place those inside a match group:
SELECT (regexp_match('Williams, Sr.', '.*, (.*)'))[1];
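-- As a hypothetical sketch of moving the suffixes into their own column, assuming
-- an authors table with an author_name column (both invented names for this example):
ALTER TABLE authors ADD COLUMN suffix text;
UPDATE authors
SET suffix = (regexp_match(author_name, '.*, (.*)'))[1],  -- capture the suffix
    author_name = regexp_replace(author_name, ', ', ' ')  -- drop the comma from the name
WHERE author_name LIKE '%, %';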
-- 2. Using any one of the State of the Union addresses, count the number of
-- unique words that are five characters or more. Hint: you can use
-- regexp_split_to_table() in a subquery to create a table of words to count.
-- Bonus: remove commas and periods at the end of each word.
-- Answer:
WITH
word_list (word)
AS
    (
        SELECT regexp_split_to_table(speech_text, '\s') AS word
        FROM president_speeches
        WHERE speech_date = '1974-01-30'
    )
SELECT lower(
           replace(replace(replace(word, ',', ''), '.', ''), ':', '')
       ) AS cleaned_word,
       count(*)
FROM word_list
WHERE length(word) >= 5
GROUP BY cleaned_word
ORDER BY count(*) DESC;
-- Note: This query uses a Common Table Expression to first separate each word
-- in the text into a separate row in a table named word_list. Then the SELECT
-- statement counts the words, which are cleaned up with two operations. First,
-- several nested replace functions remove commas, periods, and colons. Second,
-- all words are converted to lowercase so that when we count we group words
-- that may appear with various cases (e.g., "Military" and "military").
-- 3. Rewrite the query in Listing 13-25 using the ts_rank_cd() function
-- instead of ts_rank(). According to the PostgreSQL documentation, ts_rank_cd()
-- computes cover density, which takes into account how close the lexeme search
-- terms are to each other. Does using the ts_rank_cd() function significantly
-- change the results?
-- Answer:
-- The ranking does change, although the same speeches are generally
-- represented. The change might be more or less pronounced given another set
-- of texts.
SELECT president,
       speech_date,
       ts_rank_cd(search_speech_text, search_query, 2) AS rank_score
FROM president_speeches,
     to_tsquery('war & security & threat & enemy') search_query
WHERE search_speech_text @@ search_query
ORDER BY rank_score DESC
LIMIT 5;
--------------------------------------------------------------
-- Chapter 15: Analyzing Spatial Data with PostGIS
--------------------------------------------------------------
-- 1. Earlier, you found which US county has the largest area. Now,
-- aggregate the county data to find the area of each state in square
-- miles. (Use the statefp10 column in the us_counties_2010_shp table.)
-- How many states are bigger than the Yukon-Koyukuk area?
-- Answer: Just three states are bigger than Yukon-Koyukuk. One, of course, is
-- Alaska itself (FIPS 02). The other two are Texas (FIPS 48) and
-- California (FIPS 06).
SELECT statefp10 AS st,
       round(
           ( sum(ST_Area(geom::geography) / 2589988.110336))::numeric, 2
       ) AS square_miles
FROM us_counties_2010_shp
GROUP BY statefp10
ORDER BY square_miles DESC;
-- 2. Using ST_Distance(), determine how many miles separate these two farmers
-- markets: the Oakleaf Greenmarket (9700 Argyle Forest Blvd, Jacksonville,
-- Florida) and Columbia Farmers Market (1701 West Ash Street, Columbia,
-- Missouri). You'll need to first find the coordinates for both in the
-- farmers_markets table.
-- Tip: you can also write this query using the Common Table Expression syntax
-- you learned in Chapter 12.
-- Answer: About 851 miles.
WITH
market_start (geog_point) AS
    (
        SELECT geog_point
        FROM farmers_markets
        WHERE market_name = 'The Oakleaf Greenmarket'
    ),
market_end (geog_point) AS
    (
        SELECT geog_point
        FROM farmers_markets
        WHERE market_name = 'Columbia Farmers Market'
    )
SELECT ST_Distance(market_start.geog_point, market_end.geog_point) / 1609.344 -- convert meters to miles
FROM market_start, market_end;
-- 3. More than 500 rows in the farmers_markets table are missing a value
-- in the county column, an example of dirty government data. Using the
-- us_counties_2010_shp table and the ST_Intersects() function, perform a
-- spatial join to find the missing county names based on the longitude and
-- latitude of each market. Because geog_point in farmers_markets is of the
-- geography type and its SRID is 4326, you'll need to cast geom in the Census
-- table to the geography type and change its SRID using ST_SetSRID().
-- Answer:
SELECT census.name10,
       census.statefp10,
       markets.market_name,
       markets.county,
       markets.st
FROM farmers_markets markets JOIN us_counties_2010_shp census
    ON ST_Intersects(markets.geog_point, ST_SetSRID(census.geom,4326)::geography)
WHERE markets.county IS NULL
ORDER BY census.statefp10, census.name10;
-- Note that this query also highlights a farmers market that is mis-geocoded.
-- Can you spot it?
--------------------------------------------------------------
-- Chapter 16: Working with JSON Data
--------------------------------------------------------------
-- To come ...
--------------------------------------------------------------
-- Chapter 17: Saving Time with Views, Functions, and Triggers
--------------------------------------------------------------
-- 1. Create a view that displays the number of New York City taxi trips per
-- hour. Use the taxi data in Chapter 11 and the query in Listing 11-8.
-- Answer:
CREATE VIEW nyc_taxi_trips_per_hour AS
    SELECT
        date_part('hour', tpep_pickup_datetime),
        count(date_part('hour', tpep_pickup_datetime))
    FROM nyc_yellow_taxi_trips_2016_06_01
    GROUP BY date_part('hour', tpep_pickup_datetime)
    ORDER BY date_part('hour', tpep_pickup_datetime);
SELECT * FROM nyc_taxi_trips_per_hour;
-- 2. In Chapter 10, you learned how to calculate rates per thousand. Turn that
-- formula into a rates_per_thousand() function that takes three arguments
-- to calculate the result: observed_number, base_number, and decimal_places.
-- Answer: This uses PL/pgSQL, but you could use a SQL function as well.
CREATE OR REPLACE FUNCTION
rate_per_thousand(observed_number numeric,
                  base_number numeric,
                  decimal_places integer DEFAULT 1)
RETURNS numeric(10,2) AS $$
BEGIN
    RETURN
        round(
            (observed_number / base_number) * 1000, decimal_places
        );
END;
$$ LANGUAGE plpgsql;
-- Test the function:
SELECT rate_per_thousand(50, 11000, 2);
-- 3. In Chapter 9, you worked with the meat_poultry_egg_inspect table that
-- listed food processing facilities. Write a trigger that automatically adds an
-- inspection date each time you insert a new facility into the table. Use the
-- inspection_date column added in Listing 9-19, and set the date to be six
-- months from the current date. You should be able to describe the steps needed
-- to implement a trigger and how the steps relate to each other.
-- Answer:
-- a) Add the column
ALTER TABLE meat_poultry_egg_inspect ADD COLUMN inspection_date date;
-- b) Create the function that the trigger will execute.
CREATE OR REPLACE FUNCTION add_inspection_date()
RETURNS trigger AS $$
BEGIN
    NEW.inspection_date = now() + '6 months'::interval; -- Here, we set the new row's inspection date to six months in the future
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- c) Create the trigger
CREATE TRIGGER inspection_date_update
BEFORE INSERT -- BEFORE (rather than AFTER) so the value assigned to NEW is stored with the row
ON meat_poultry_egg_inspect
FOR EACH ROW
EXECUTE PROCEDURE add_inspection_date();
-- d) Test the insertion of a company and examine the result
INSERT INTO meat_poultry_egg_inspect(est_number, company)
VALUES ('test123', 'testcompany');
SELECT * FROM meat_poultry_egg_inspect
WHERE company = 'testcompany';
--------------------------------------------------------------
-- Chapter 18: Using PostgreSQL From the Command Line
--------------------------------------------------------------
-- For this chapter, use psql to review any of the exercises in the book.
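-- For example, assuming the book's database is named analysis and you connect as
-- the user postgres (both are placeholders for your own setup), you could launch
-- psql and re-run one of the earlier queries:
-- psql -d analysis -U postgres
-- analysis=# SELECT count(*) FROM us_counties_2010;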
--------------------------------------------------------------
-- Chapter 19: Maintaining Your Database
--------------------------------------------------------------
-- To back up the gis_analysis database, use the pg_dump utility at the command line:
-- pg_dump -d gis_analysis -U [your-username] -Fc > gis_analysis_backup_custom.sql
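-- If you later need to restore that backup on a server where the gis_analysis
-- database doesn't yet exist, pg_restore can recreate it from the custom-format
-- file; a sketch, using the same username placeholder:
-- pg_restore -C -d postgres -U [your-username] gis_analysis_backup_custom.sql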
-----------------------------------------------------------------
-- Chapter 20: Identifying and Telling the Story Behind Your Data
-----------------------------------------------------------------
-- This is a non-coding chapter.