Chapter 14 first revision

2021-01-11 21:52:09 -05:00 · 2021-01-11 21:52:09 -05:00 · 602ac5e580
commit 602ac5e580
parent 1a389ccd3f
4 changed files with 8234 additions and 4724 deletions
--- a/Chapter_14/Chapter_14.sql
+++ b/Chapter_14/Chapter_14.sql
@ -1,14 +1,10 @@
-- FIRST EDITION FILE; IGNORE
-
-
-
-
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
+---------------------------------------------------------------------------
+-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
 -- by Anthony DeBarros

 -- Chapter 14 Code Examples
--------------------------------------------------------------
+----------------------------------------------------------------------------
+

 -- Commonly used string functions
 -- Full list at https://www.postgresql.org/docs/current/static/functions-string.html
@ -39,70 +35,100 @@ SELECT right('703-555-1212', 8);
 SELECT replace('bat', 'b', 'c');


-- Table 14-2: Regular Expression Matching Examples
+-- Table 14-1: Regular Expression Matching Examples

 -- Any character one or more times
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '.+');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '.+');
 -- One or two digits followed by a space and p.m.
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{1,2} (?:a.m.|p.m.)');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{1,2} (?:a.m.|p.m.)');
 -- One or more word characters at the start
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '^\w+');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '^\w+');
 -- One or more word characters followed by any character at the end.
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\w+.$');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\w+.$');
 -- The words May or June
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from 'May|June');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May|June');
 -- Four digits
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{4}');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{4}');
 -- May followed by a space, digit, comma, space, and four digits.
-SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from 'May \d, \d{4}');
+SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May \d, \d{4}');
+
+-- Listing 14-1: Using regular expressions in a WHERE clause
+
+SELECT county_name
+FROM us_counties_pop_est_2019
+WHERE county_name ~* '(.+lade.+|.+lare.+)'
+ORDER BY county_name;
+
+SELECT county_name
+FROM us_counties_pop_est_2019
+WHERE county_name ~* '.+ash.+' AND county_name !~ 'Wash.+'
+ORDER BY county_name;
+
+
+-- Listing 14-2: Regular expression functions to replace and split
+
+SELECT regexp_replace('05/12/2024', '\d{4}', '2023');
+
+SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ',');
+
+SELECT regexp_split_to_array('Phil Mike Tony Steve', ' ');
+
+-- Listing 14-3: Finding an array length
+
+SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1);


 -- Turning Text to Data with Regular Expression Functions

-- Listing 14-2: Creating and loading the crime_reports table
+-- Listing 14-5: Creating and loading the crime_reports table
 -- Data from https://sheriff.loudoun.gov/dailycrime

 CREATE TABLE crime_reports (
-    crime_id bigserial PRIMARY KEY,
+    crime_id integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
+    case_number text,
    date_1 timestamp with time zone,
    date_2 timestamp with time zone,
-    street varchar(250),
-    city varchar(100),
-    crime_type varchar(100),
+    street text,
+    city text,
+    crime_type text,
    description text,
-    case_number varchar(50),
    original_text text NOT NULL
 );

 COPY crime_reports (original_text)
-FROM 'C:\YourDirectory\crime_reports.csv'
+-- FROM 'C:\YourDirectory\crime_reports.csv'
+FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_14/crime_reports.csv'
 WITH (FORMAT CSV, HEADER OFF, QUOTE '"');

 SELECT original_text FROM crime_reports;

-- Listing 14-3: Using regexp_match() to find the first date
+-- Listing 14-6: Using regexp_match() to find the first date
 SELECT crime_id,
       regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}')
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-4: Using the regexp_matches() function with the 'g' flag
+-- Listing 14-7: Using the regexp_matches() function with the 'g' flag
 SELECT crime_id,
       regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g')
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-5: Using regexp_match() to find the second date
+-- Listing 14-8: Using regexp_match() to find the second date
 -- Note that the result includes an unwanted hyphen
 SELECT crime_id,
       regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{1,2}')
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-6: Using a capture group to return only the date
+-- Listing 14-9: Using a capture group to return only the date
 -- Eliminates the hyphen
 SELECT crime_id,
       regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})')
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-7: Matching case number, date, crime type, and city
+-- Listing 14-10: Matching case number, date, crime type, and city

 SELECT
    regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number,
@ -110,7 +136,8 @@ SELECT
    regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
    regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n')
        AS city
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

 -- Bonus: Get all parsed elements at once

@ -130,17 +157,19 @@ SELECT crime_id,
       regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
       regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description,
       regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-8: Retrieving a value from within an array
+-- Listing 14-11: Retrieving a value from within an array

 SELECT
    crime_id,
    (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]
        AS case_number
-FROM crime_reports;
+FROM crime_reports
+ORDER BY crime_id;

-- Listing 14-9: Updating the crime_reports date_1 column
+-- Listing 14-12: Updating the crime_reports date_1 column

 UPDATE crime_reports
 SET date_1 = 
@ -149,14 +178,10 @@ SET date_1 =
        || ' ' ||
    (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] 
        ||' US/Eastern'
-)::timestamptz;
+)::timestamptz
+RETURNING crime_id, date_1, original_text;

-SELECT crime_id,
-       date_1,
-       original_text
-FROM crime_reports;
-
-- Listing 14-10: Updating all crime_reports columns
+-- Listing 14-13: Updating all crime_reports columns

 UPDATE crime_reports
 SET date_1 = 
@ -198,38 +223,14 @@ SET date_1 =
    description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1],
    case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1];

-- Listing 14-11: Viewing selected crime data
+-- Listing 14-14: Viewing selected crime data

 SELECT date_1,
       street,
       city,
       crime_type
-FROM crime_reports;
-
-- Listing 14-12: Using regular expressions in a WHERE clause
-
-SELECT geo_name
-FROM us_counties_2010
-WHERE geo_name ~* '(.+lade.+|.+lare.+)'
-ORDER BY geo_name;
-
-SELECT geo_name
-FROM us_counties_2010
-WHERE geo_name ~* '.+ash.+' AND geo_name !~ 'Wash.+'
-ORDER BY geo_name;
-
-
-- Listing 14-13: Regular expression functions to replace and split
-
-SELECT regexp_replace('05/12/2018', '\d{4}', '2017');
-
-SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ',');
-
-SELECT regexp_split_to_array('Phil Mike Tony Steve', ' ');
-
-- Listing 14-14: Finding an array length
-
-SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1);
+FROM crime_reports
+ORDER BY crime_id;


 -- FULL TEXT SEARCH
@ -261,16 +262,17 @@ SELECT to_tsvector('I am walking across the sitting room') @@ to_tsquery('walkin
 -- https://www.eisenhower.archives.gov/all_about_ike/speeches.html

 CREATE TABLE president_speeches (
-    sotu_id serial PRIMARY KEY,
-    president varchar(100) NOT NULL,
-    title varchar(250) NOT NULL,
+    president text NOT NULL,
+    title text NOT NULL,
    speech_date date NOT NULL,
    speech_text text NOT NULL,
-    search_speech_text tsvector
+    search_speech_text tsvector,
+    CONSTRAINT speech_key PRIMARY KEY (president, speech_date)
 );

 COPY president_speeches (president, title, speech_date, speech_text)
-FROM 'C:\YourDirectory\sotu-1946-1977.csv'
+-- FROM 'C:\YourDirectory\president_speeches.csv'
+FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_14/president_speeches.csv'
 WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@');

 SELECT * FROM president_speeches;
@ -295,14 +297,15 @@ ORDER BY speech_date;

 SELECT president,
       speech_date,
-       ts_headline(speech_text, to_tsquery('Vietnam'),
+       ts_headline(speech_text, to_tsquery('tax'),
                   'StartSel = <,
                    StopSel = >,
                    MinWords=5,
                    MaxWords=7,
                    MaxFragments=1')
 FROM president_speeches
-WHERE search_speech_text @@ to_tsquery('Vietnam');
+WHERE search_speech_text @@ to_tsquery('tax');
+

 -- Listing 14-23: Finding speeches with the word "transportation" but not "roads"

--- a/Chapter_14/president_speeches.csv
+++ b/Chapter_14/president_speeches.csv
--- a/Chapter_14/sotu-1946-1977.csv
+++ b/Chapter_14/sotu-1946-1977.csv
--- a/Try_It_Yourself/Try_It_Yourself.sql
+++ b/Try_It_Yourself/Try_It_Yourself.sql
@ -867,3 +867,76 @@ AS (flavor text,
    midtown bigint,
    uptown bigint);

+
+----------------------------------------------------------------------------
+-- Chapter 14: Mining Text to Find Meaningful Data
+----------------------------------------------------------------------------
+
+-- 1. The style guide of a publishing company you're writing for wants you to
+-- avoid commas before suffixes in names. But there are several names like
+-- Alvarez, Jr. and Williams, Sr. in your author database. Which functions can
+-- you use to remove the comma? Would a regular expression function help?
+-- How would you capture just the suffixes to place them into a separate column?
+
+-- Answer: You can use either the standard SQL replace() function or the
+-- PostgreSQL regexp_replace() function:
+
+SELECT replace('Williams, Sr.', ', ', ' ');
+SELECT regexp_replace('Williams, Sr.', ', ', ' ');
+
+-- Answer: To capture just the suffixes, search for characters after a comma
+-- and space and place those inside a match group:
+
+SELECT (regexp_match('Williams, Sr.', '.*, (.*)'))[1];
+
+
+-- 2. Using any one of the presidents' speeches addresses, count the number of
+-- unique words that are five characters or more. Hint: you can use
+-- regexp_split_to_table() in a subquery to create a table of words to count.
+-- Bonus: remove commas and periods at the end of each word.
+
+-- Answer:
+-- This query uses a Common Table Expression to first separate each word
+-- in the text into a separate row in a table named word_list. Then the SELECT
+-- statement counts the words, which are cleaned up with two operations. First,
+-- several nested replace functions remove commas, periods, and colons. Second,
+-- all words are converted to lowercase so that when we count we group words
+-- that may appear with various cases (e.g., "Military" and "military").
+
+WITH
+    word_list (word)
+AS
+    (
+        SELECT regexp_split_to_table(speech_text, '\s') AS word
+        FROM president_speeches
+        WHERE speech_date = '1946-01-21'
+    )
+
+SELECT lower(
+               replace(replace(replace(word, ',', ''), '.', ''), ':', '')
+             ) AS cleaned_word,
+       count(*)
+FROM word_list
+WHERE length(word) >= 5
+GROUP BY cleaned_word
+ORDER BY count(*) DESC;
+
+
+-- 3. Rewrite the query in Listing 14-25 using the ts_rank_cd() function
+-- instead of ts_rank(). According to th PostgreSQL documentation, ts_rank_cd()
+-- computes cover density, which takes into account how close the lexeme search
+-- terms are to each other. Does using the ts_rank_cd() function significantly
+-- change the results?
+
+-- Answer:
+-- The ranking does change, highlighting post-9/11 speeches by George W. Bush.
+-- The change might be more or less pronounced given another set of texts.
+
+SELECT president,
+       speech_date,
+       ts_rank_cd(search_speech_text, search_query, 2) AS rank_score
+FROM president_speeches,
+     to_tsquery('war & security & threat & enemy') search_query
+WHERE search_speech_text @@ search_query
+ORDER BY rank_score DESC
+LIMIT 5;