New Chapter 16 code and data on working with JSON
This commit is contained in:
parent
7d9eeb5ece
commit
f9a6b17912
421
Chapter_16/Chapter_16.sql
Normal file
@@ -0,0 +1,421 @@
----------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
-- by Anthony DeBarros

-- Chapter 16 Code Examples
----------------------------------------------------------------------------

-- Listing 16-1: JSON with information on two films

[{
    "title": "The Incredibles",
    "year": 2004,
    "rating": {
        "MPAA": "PG"
    },
    "characters": [{
        "name": "Mr. Incredible",
        "actor": "Craig T. Nelson"
    }, {
        "name": "Elastigirl",
        "actor": "Holly Hunter"
    }, {
        "name": "Frozone",
        "actor": "Samuel L. Jackson"
    }],
    "genre": ["animation", "action", "sci-fi"]
}, {
    "title": "Cinema Paradiso",
    "year": 1988,
    "characters": [{
        "name": "Salvatore",
        "actor": "Salvatore Cascio"
    }, {
        "name": "Alfredo",
        "actor": "Philippe Noiret"
    }],
    "genre": ["romance", "drama"]
}]

-- Listing 16-2: Creating a table to hold JSON data and adding an index

CREATE TABLE films (
    id integer GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    film jsonb
);
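
-- films.json stores each film as one complete JSON object per line, so COPY loads one row per film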
COPY films (film)
FROM 'C:\YourDirectory\films.json';
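
-- A GIN index on the jsonb column speeds up searches that use the containment (@>) and
-- existence (?, ?|, ?&) operators shown later in this chapter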
CREATE INDEX idx_film ON films USING GIN (film);

SELECT * FROM films;


-- JSON AND JSONB EXTRACTION OPERATORS

-- Listing 16-3: Retrieving a JSON key value with field extraction operators

-- Returns the key value as a JSON data type
SELECT id, film -> 'title' AS title
FROM films
ORDER BY id;

-- Returns the key value as text
SELECT id, film ->> 'title' AS title
FROM films
ORDER BY id;

-- Returns the entire array as a JSON data type
SELECT id, film -> 'genre' AS genre
FROM films
ORDER BY id;

-- Listing 16-4: Retrieving a JSON array value with element extraction operators

-- Extracts first element of the JSON array
-- (array elements are indexed from zero, but negative integers count from the end).
SELECT id, film -> 'genre' -> 0 AS genres
FROM films
ORDER BY id;

SELECT id, film -> 'genre' -> -1 AS genres
FROM films
ORDER BY id;

SELECT id, film -> 'genre' -> 2 AS genres
FROM films
ORDER BY id;

-- Return the array element as text
SELECT id, film -> 'genre' ->> 0 AS genres
FROM films
ORDER BY id;

-- Listing 16-5: Retrieving a JSON key value with path extraction operators

-- Retrieve the film's MPAA rating.
SELECT id, film #> '{rating, MPAA}' AS mpaa_rating
FROM films
ORDER BY id;

-- Retrieve the name of the first character
SELECT id, film #> '{characters, 0, name}' AS name
FROM films
ORDER BY id;

-- Same as above but return it as text
SELECT id, film #>> '{characters, 0, name}' AS name
FROM films
ORDER BY id;


-- JSONB CONTAINMENT AND EXISTENCE OPERATORS

-- Listing 16-6: Demonstrating containment operators

-- Does the film JSON value contain the following key/value pair?
SELECT id, film ->> 'title' AS title,
       film @> '{"title": "The Incredibles"}'::jsonb AS is_incredible
FROM films
ORDER BY id;

-- Using the containment operator in WHERE
SELECT film ->> 'title' AS title,
       film ->> 'year' AS year
FROM films
WHERE film @> '{"title": "The Incredibles"}'::jsonb;

-- Is the first JSON value contained in the second?
SELECT film ->> 'title' AS title,
       film ->> 'year' AS year
FROM films
WHERE '{"title": "The Incredibles"}'::jsonb <@ film;

-- Listing 16-7: Demonstrating existence operators

-- Does the text string exist as a top-level key or array element within the JSON value?
SELECT film ->> 'title' AS title
FROM films
WHERE film ? 'rating';

-- Do any of the strings in the text array exist as top-level keys or array elements?
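-- ('{rating, genre}' is a PostgreSQL text array literal listing the keys to test, not a JSON path)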
SELECT film ->> 'title' AS title,
       film ->> 'rating' AS rating,
       film ->> 'genre' AS genre
FROM films
WHERE film ?| '{rating, genre}';

-- Do all of the strings in the text array exist as top-level keys or array elements?
SELECT film ->> 'title' AS title,
       film ->> 'rating' AS rating,
       film ->> 'genre' AS genre
FROM films
WHERE film ?& '{rating, genre}';


-- ANALYZING EARTHQUAKE DATA

-- Listing 16-8: JSON with data on one earthquake

{
    "type": "Feature",
    "properties": {
        "mag": 1.44,
        "place": "134 km W of Adak, Alaska",
        "time": 1612051063470,
        "updated": 1612139465880,
        "tz": null,
        "url": "https://earthquake.usgs.gov/earthquakes/eventpage/av91018173",
        "detail": "https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=av91018173&format=geojson",
        "felt": null,
        "cdi": null,
        "mmi": null,
        "alert": null,
        "status": "reviewed",
        "tsunami": 0,
        "sig": 32,
        "net": "av",
        "code": "91018173",
        "ids": ",av91018173,",
        "sources": ",av,",
        "types": ",origin,phase-data,",
        "nst": 10,
        "dmin": null,
        "rms": 0.15,
        "gap": 174,
        "magType": "ml",
        "type": "earthquake",
        "title": "M 1.4 - 134 km W of Adak, Alaska"
    },
    "geometry": {
        "type": "Point",
        "coordinates": [-178.581, 51.8418333333333, 22.48]
    },
    "id": "av91018173"
}

-- Listing 16-9: Creating and loading an earthquakes table

CREATE TABLE earthquakes (
    id integer GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    earthquake jsonb
);

COPY earthquakes (earthquake)
FROM 'C:\YourDirectory\earthquakes.json';

CREATE INDEX idx_earthquakes ON earthquakes USING GIN (earthquake);

SELECT * FROM earthquakes;

-- Listing 16-10: Retrieving the earthquake time
-- Note that the time is stored in epoch format (milliseconds since midnight UTC, January 1, 1970)

SELECT id, earthquake #>> '{properties, time}' AS time
FROM earthquakes
ORDER BY id LIMIT 5;

-- Listing 16-11: Converting the time value to a timestamp
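-- to_timestamp() expects seconds, so the millisecond epoch value is divided by 1,000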

SELECT id, earthquake #>> '{properties, time}' as time,
       to_timestamp(
           (earthquake #>> '{properties, time}')::bigint / 1000
       ) AS time_formatted
FROM earthquakes
ORDER BY id LIMIT 5;

-- See and set time zone if desired
SHOW timezone;
SET timezone TO 'US/Eastern';
SET timezone TO 'UTC';

-- Listing 16-12: Finding the minimum and maximum earthquake times

SELECT min(to_timestamp(
           (earthquake #>> '{properties, time}')::bigint / 1000
       )) AT TIME ZONE 'UTC' AS min_timestamp,
       max(to_timestamp(
           (earthquake #>> '{properties, time}')::bigint / 1000
       )) AT TIME ZONE 'UTC' AS max_timestamp
FROM earthquakes;

-- Listing 16-13: Finding the five earthquakes with the largest magnitude
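-- DESC NULLS LAST keeps earthquakes with a missing mag value from sorting to the top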

SELECT earthquake #>> '{properties, place}' AS place,
       to_timestamp((earthquake #>> '{properties, time}')::bigint / 1000)
           AT TIME ZONE 'UTC' AS time,
       (earthquake #>> '{properties, mag}')::numeric AS magnitude
FROM earthquakes
ORDER BY (earthquake #>> '{properties, mag}')::numeric DESC NULLS LAST
LIMIT 5;

-- Bonus: Instead of using a path extraction operator (#>>), you can also use field extraction:
SELECT earthquake -> 'properties' ->> 'place' AS place,
       to_timestamp((earthquake -> 'properties' ->> 'time')::bigint / 1000)
           AT TIME ZONE 'UTC' AS time,
       (earthquake #>> '{properties, mag}')::numeric AS magnitude
FROM earthquakes
ORDER BY (earthquake #>> '{properties, mag}')::numeric DESC NULLS LAST
LIMIT 5;

-- Listing 16-14: Finding earthquakes with the most Did You Feel It? reports
-- https://earthquake.usgs.gov/data/dyfi/

SELECT earthquake #>> '{properties, place}' AS place,
       to_timestamp((earthquake #>> '{properties, time}')::bigint / 1000)
           AT TIME ZONE 'UTC' AS time,
       (earthquake #>> '{properties, mag}')::numeric AS magnitude,
       (earthquake #>> '{properties, felt}')::integer AS felt
FROM earthquakes
ORDER BY (earthquake #>> '{properties, felt}')::integer DESC NULLS LAST
LIMIT 5;

-- Listing 16-15: Extracting the earthquake's location data

SELECT id,
       earthquake #>> '{geometry, coordinates}' AS coordinates,
       earthquake #>> '{geometry, coordinates, 0}' AS longitude,
       earthquake #>> '{geometry, coordinates, 1}' AS latitude
FROM earthquakes
ORDER BY id
LIMIT 5;

-- Listing 16-16: Converting JSON location data to PostGIS geography
SELECT ST_SetSRID(
           ST_MakePoint(
               (earthquake #>> '{geometry, coordinates, 0}')::numeric,
               (earthquake #>> '{geometry, coordinates, 1}')::numeric
           ),
           4326)::geography AS earthquake_point
FROM earthquakes
ORDER BY id;

-- Listing 16-17: Finding earthquakes within 50 miles of downtown Tulsa, Oklahoma

-- Add a column of the geography data type
ALTER TABLE earthquakes ADD COLUMN earthquake_point geography(POINT, 4326);

-- Update the earthquakes table with a Point
UPDATE earthquakes
SET earthquake_point =
    ST_SetSRID(
        ST_MakePoint(
            (earthquake #>> '{geometry, coordinates, 0}')::numeric,
            (earthquake #>> '{geometry, coordinates, 1}')::numeric
        ),
        4326)::geography;

CREATE INDEX quake_pt_idx ON earthquakes USING GIST (earthquake_point);

-- Earthquakes 50 miles from Tulsa
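-- ST_DWithin() with geography arguments measures distance in meters; 80,468 meters is roughly 50 miles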
SELECT earthquake #>> '{properties, place}' AS place,
       to_timestamp((earthquake -> 'properties' ->> 'time')::bigint / 1000)
           AT TIME ZONE 'UTC' AS time,
       (earthquake #>> '{properties, mag}')::numeric AS magnitude,
       earthquake_point
FROM earthquakes
WHERE ST_DWithin(earthquake_point,
                 ST_GeogFromText('POINT(-95.989505 36.155007)'),
                 80468)
ORDER BY time;


-- GENERATING AND MANIPULATING JSON

-- Listing 16-18: Turning query results into JSON with to_json() and json_agg()

-- Convert an entire row from the table
SELECT to_json(employees) AS json_rows
FROM employees;

-- Selected columns from the table (returns key names as f1, f2, etc.)
SELECT to_json(row(emp_id, last_name)) AS json_rows
FROM employees;
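
-- As an alternative (a sketch, not one of the book's listings), jsonb_build_object()
-- lets you name each key explicitly instead of getting f1, f2, etc.:
SELECT jsonb_build_object('emp_id', emp_id, 'last_name', last_name) AS json_rows
FROM employees;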

-- Generating key names with a subquery
SELECT to_json(employees) AS json_rows
FROM (
    SELECT emp_id, last_name AS ln FROM employees
) AS employees;

-- Aggregating the rows and converting to JSON
SELECT json_agg(to_json(employees)) AS json
FROM (
    SELECT emp_id, last_name AS ln FROM employees
) AS employees;

-- Listing 16-19: Adding a top-level key/value pair via concatenation
-- Two examples

UPDATE films
SET film = film || '{"studio": "Pixar"}'::jsonb
WHERE film @> '{"title": "The Incredibles"}'::jsonb;

UPDATE films
SET film = film || jsonb_build_object('studio', 'Pixar')
WHERE film @> '{"title": "The Incredibles"}'::jsonb;

SELECT film FROM films -- check the updated data
WHERE film @> '{"title": "The Incredibles"}'::jsonb;

-- Listing 16-20: Setting an array value at a path
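-- jsonb_set() takes the target value, a path, the new value, and an optional create_if_missing
-- flag; passing false means the genre array must already exist for the update to apply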

UPDATE films
SET film = jsonb_set(film,
                     '{genre}',
                     film #> '{genre}' || '["World War II"]',
                     false)
WHERE film @> '{"title": "Cinema Paradiso"}'::jsonb;

SELECT film FROM films -- check the updated data
WHERE film @> '{"title": "Cinema Paradiso"}'::jsonb;


-- Listing 16-21: Deleting values from JSON

-- Removes the studio key/value pair from The Incredibles
UPDATE films
SET film = film - 'studio'
WHERE film @> '{"title": "The Incredibles"}'::jsonb;

-- Removes the third element in the genre array of Cinema Paradiso
UPDATE films
SET film = film #- '{genre, 2}'
WHERE film @> '{"title": "Cinema Paradiso"}'::jsonb;


-- JSON PROCESSING FUNCTIONS

-- Listing 16-22: Finding the length of an array

SELECT id,
       film ->> 'title' AS title,
       jsonb_array_length(film -> 'characters') AS num_characters
FROM films
ORDER BY id;

-- Listing 16-23: Returning array elements as rows
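-- Both functions unnest the same genre array, so each output row pairs a jsonb element with its text equivalent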

SELECT id,
       jsonb_array_elements(film -> 'genre') AS genre_jsonb,
       jsonb_array_elements_text(film -> 'genre') AS genre_text
FROM films
ORDER BY id;

-- Listing 16-24: Returning key values from each item in an array

SELECT id,
       jsonb_array_elements(film -> 'characters')
FROM films
ORDER BY id;

WITH characters (id, json) AS (
    SELECT id,
           jsonb_array_elements(film -> 'characters')
    FROM films
)
SELECT id,
       json ->> 'name' AS name,
       json ->> 'actor' AS actor
FROM characters
ORDER BY id;
12899
Chapter_16/earthquakes.json
Normal file
File diff suppressed because it is too large
2
Chapter_16/films.json
Normal file
@@ -0,0 +1,2 @@
{"title": "The Incredibles", "year": 2004, "rating": {"MPAA": "PG"}, "characters": [{"name": "Mr. Incredible", "actor": "Craig T. Nelson"}, {"name": "Elastigirl", "actor": "Holly Hunter"}, {"name": "Frozone", "actor": "Samuel L. Jackson"}], "genre": ["animation", "action", "sci-fi"]}
{"title": "Cinema Paradiso", "year": 1988, "characters": [{"name": "Salvatore", "actor": "Salvatore Cascio"}, {"name": "Alfredo", "actor": "Philippe Noiret"}], "genre": ["romance", "drama"]}
@@ -1012,3 +1012,128 @@ ORDER BY census.statefp, census.name;
-- Note that this query also highlights a farmer's market that is mis-geocoded.
-- Can you spot it?

----------------------------------------------------------------------------
-- Chapter 16: Working with JSON Data
----------------------------------------------------------------------------

-- 1. The earthquakes JSON has a key tsunami that’s set to a value of 1 for
-- large earthquakes in oceanic regions (though it doesn’t mean a tsunami
-- actually happened). Using either path or element extraction operators,
-- find earthquakes with a tsunami value of 1 and include their location, time
-- and magnitude in your results.

-- Answer:

SELECT earthquake -> 'properties' ->> 'place' AS place,
       to_timestamp((earthquake -> 'properties' ->> 'time')::bigint / 1000) AS timestamp,
       (earthquake -> 'properties' ->> 'mag')::numeric AS mag
FROM earthquakes
WHERE (earthquake -> 'properties' ->> 'tsunami') = '1';

-- Same result but using path extraction operators instead of field extraction operators
SELECT earthquake #>> '{properties, place}' AS place,
       to_timestamp((earthquake -> 'properties' ->> 'time')::bigint / 1000) AS timestamp,
       (earthquake #>> '{properties, mag}')::numeric AS mag
FROM earthquakes
WHERE (earthquake #>> '{properties, tsunami}') = '1';

-- 2. Use the following CREATE TABLE statement to add the table earthquakes_from_json
-- to your analysis database:

CREATE TABLE earthquakes_from_json (
    id text PRIMARY KEY,
    title text,
    type text,
    quake_date timestamp with time zone,
    mag numeric,
    place text,
    earthquake_point geography(POINT, 4326),
    url text
);

-- Using field and path extraction operators, write an INSERT statement
-- to fill the table with the correct values for each earthquake. Refer
-- to the full sample earthquake JSON in your Chapter_16.sql file for any
-- key names and paths you need.

-- Answer:
-- Note that you could use either field or path operators to retrieve the needed elements.
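-- Because this INSERT omits a column list, the SELECT expressions must follow the column order
-- defined in earthquakes_from_json.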

INSERT INTO earthquakes_from_json
SELECT earthquake ->> 'id',
       earthquake -> 'properties' ->> 'title',
       earthquake -> 'properties' ->> 'type',
       to_timestamp((earthquake -> 'properties' ->> 'time')::bigint / 1000),
       (earthquake -> 'properties' ->> 'mag')::numeric,
       earthquake -> 'properties' ->> 'place',
       ST_SetSRID(
           ST_MakePoint(
               (earthquake #>> '{geometry, coordinates, 0}')::numeric,
               (earthquake #>> '{geometry, coordinates, 1}')::numeric
           ),
           4326)::geography,
       earthquake -> 'properties' ->> 'url'
FROM earthquakes;

-- View the inserted data:
SELECT * FROM earthquakes_from_json;


-- 3. Bonus (difficult) question: Try writing a query to generate the
-- following JSON using the data in the teachers and teachers_lab_access
-- tables from Chapter 13:
{
    "id": 6,
    "fn": "Kathleen",
    "ln": "Roush",
    "lab_access": [{
        "lab_name": "Science B",
        "access_time": "2022-12-17T16:00:00-05:00"
    }, {
        "lab_name": "Science A",
        "access_time": "2022-12-07T10:02:00-05:00"
    }]
}
-- It’s helpful to remember that the teachers table has a one-to-many relationship
-- with teachers_lab_access; the first three keys must come from teachers, and the
-- JSON objects in the array of lab_access will come from teachers_lab_access.
-- Hint: you’ll need to use a subquery in your SELECT list and a function called
-- json_agg() to create the lab_access array.

-- Answer:

SELECT to_json(teachers_labs)
FROM (
    SELECT id,
           first_name AS fn,
           last_name AS ln,
           (
               SELECT json_agg(to_json(la))
               FROM (
                   SELECT lab_name, access_time
                   FROM teachers_lab_access
                   WHERE teacher_id = teachers.id
                   ORDER BY access_time DESC
               ) AS la
           ) AS lab_access
    FROM teachers
    WHERE id = 6)
AS teachers_labs;

-- What's happening here? Structurally, we have a series of nested subqueries.
-- We use the first subquery to retrieve the id, first_name, and last_name columns from
-- the teachers table. In that subquery's SELECT list, we place another subquery that
-- retrieves the lab_name and access_time from the teachers_lab_access table.

-- What makes this query tricky is that we need to use an inner subquery to generate the
-- lab access objects and the json_agg() function to aggregate those rows into an array.

-- When writing a query such as this, it's helpful to work in chunks. Start with the
-- outermost query and add the subqueries one by one, testing as you go.
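
-- For example, a sketch (assuming teacher id 6, as in the exercise) that tests the inner
-- aggregation on its own before nesting it in the larger query:
SELECT json_agg(to_json(la))
FROM (
    SELECT lab_name, access_time
    FROM teachers_lab_access
    WHERE teacher_id = 6
    ORDER BY access_time DESC
) AS la;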