Crime Report DB in Postgresql¶

Objectives:

  • Create a database crimes_db with a table boston_crimes with appropriate data types
  • Create a schema and create the table inside it
  • Load the data from boston.csv into the table
  • Create readonly and readwrite groups with appropriate privileges
  • Create users for each of the groups

Create DB and Schema¶

In [ ]:
# Read user_name and password from environment variables loaded from a .env file
from dotenv import load_dotenv
import os

# Populate os.environ from the .env file in the working directory
load_dotenv()

# Credentials used by every psycopg2 connection below
username = os.environ.get('user_name') 
pwd = os.environ.get('password')
In [1]:
# Import library and create the DB and close the connection
import psycopg2

connection_str = "user={} password={} host=127.0.0.1 port=5432 dbname=DB".format(username,pwd) 

conn = psycopg2.connect(connection_str)
cur = conn.cursor()
# set autocommit to True bacause this is required for creating databases
# CREATE DATABASE can only be executed as a single
conn.autocommit = True 

cur.execute("CREATE DATABASE crimes_db;")
conn.close()
In [2]:
# Reconnect, this time to the newly created crimes_db database
connection_str = "user={} password={} host=127.0.0.1 port=5432 dbname=crimes_db".format(username,pwd) 
conn = psycopg2.connect(connection_str)
cur = conn.cursor()

# Create a Schema in DB to hold the boston_crimes table
cur.execute("CREATE SCHEMA crimes;")
In [3]:
# Let's Check Schema and DBs
# Sanity check: list every schema and confirm which database we are connected to
qu_schema = """SELECT schema_name
                FROM information_schema.schemata;
            """
cur.execute(qu_schema)
schema_name = cur.fetchall()

# current_database() reports the database of this session
cur.execute("SELECT current_database();")
dbname = cur.fetchall()

print("Schema name  = {} \n DB name = {}".format(schema_name, dbname))
Schema name  = [('pg_toast',), ('pg_temp_1',), ('pg_toast_temp_1',), ('pg_catalog',), ('information_schema',), ('public',), ('crimes',)] 
 DB name = [('crimes_db',)]

Obtain Column Names and Sample¶

In [24]:
# Read boston.csv once: keep the raw lines, parse the header and the
# first data row, and report the data-row count.
import csv

with open('boston.csv', 'r') as f:
    rows = list(f)  # every raw line, header included

# Parse the header and first data row from the lines already in memory
# (csv.reader accepts any iterable of lines) instead of re-opening the file.
reader = csv.reader(rows)
col_headers = next(reader)
first_row = next(reader)

# Preview the first three raw lines (header + two data rows)
for idx, row in enumerate(rows[:3], start=1):
    print(idx, row, sep='\t')

print("Number of rows in the file excluding header = {}".format(len(rows)-1))
1	incident_number,offense_code,description,date,day_of_the_week,lat,long

2	1,619,LARCENY ALL OTHERS,2018-09-02,Sunday,42.35779134,-71.13937053

3	2,1402,VANDALISM,2018-08-21,Tuesday,42.30682138,-71.06030035

Number of rows in the file excluding header = 298329

Helper function to analyze column values¶

Create a function get_col_set that returns a set of all distinct values of a column by passing in CSV file name and column index

Helps to see if the column is categorical with a limited set of values (to use an enumerated datatype) and to check the maximum number of characters in a column

In [5]:
def get_col_set(csv_file, col_idx):
    """Return the set of distinct values in one column of a CSV file.

    Args:
        csv_file: path to the CSV file (first line is assumed to be a header).
        col_idx: zero-based index of the column to collect.

    Returns:
        A set of the distinct string values found in that column,
        header excluded. Empty set for a file with no data rows.
    """
    import csv

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        # Skip the header via the csv reader (not the raw file handle) so a
        # quoted, multi-line header cannot desynchronize the parser; the
        # default of None makes an empty file yield an empty set.
        next(reader, None)
        return {row[col_idx] for row in reader}
In [6]:
# Distinct-value count per column: a small count suggests a categorical
# column (candidate for an enumerated type)
for col, header in enumerate(col_headers):
    print(header, len(get_col_set("boston.csv", col)), sep='\t')
incident_number	298329
offense_code	219
description	239
date	1177
day_of_the_week	7
lat	18177
long	18177

Analyzing the maximum length or value range¶

  • description and day_of_the_week are textual columns
  • Longest text in day_of_the_week must be Wednesday but we need to compute that for the other column
In [16]:
# Echo the column headers (the notebook displays the last expression)
col_headers
Out[16]:
['incident_number',
 'offense_code',
 'description',
 'date',
 'day_of_the_week',
 'lat',
 'long']
In [7]:
# Max characters
desc_set = get_col_set("boston.csv", 2)

max_len = max(desc_set, key=len)
len(max_len)
Out[7]:
58
  • Lat and Long columns have typically 8 digits after decimal and 2 before
In [14]:
# Max Precision and Scale 
long_set = get_col_set("boston.csv", 6)

# Longest longitude string: shows the digits before/after the decimal point
max_len = max(long_set, key=len)
max_len
Out[14]:
'-71.14678737'
In [9]:
# Same check for latitude: longest string shows the precision/scale needed
lat_set = get_col_set("boston.csv", 5)
max_dig = max(lat_set, key=len)
max_dig
Out[9]:
'42.29297429'
  • Check the maximum value in offense_code and distribution to decide whether smallint is big enough for this column
In [19]:
# Max value and distribution of offense_code: smallint holds up to 32767,
# so it is sufficient if the maximum code stays below that.
with open("boston.csv", 'r') as f:
    next(f)  # skip the header line
    reader = csv.reader(f)
    # collect the codes with a comprehension instead of a manual append loop
    col = [int(row[1]) for row in reader]

max(col)
Out[19]:
3831
In [20]:
# Histogram of offense_code values to see how they are distributed
# (%matplotlib inline is an IPython magic to render plots in the notebook)
import pandas as pd
%matplotlib inline

pd.Series(col).hist()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe9dda6a58>

Creating the table¶

I will use the same column name in the DB.

Table Datatypes:

incident_number could contain big numbers as the table grows, so numeric is useful as it has a variable byte size. It is also set as the primary key.

offense_code are positive integer with 4 digits maximum, so smallint with 2 byte would be suitable.

description is textual and maximum 58 characters so varchar(n) with size 100 so we leave some margin while not wasting extra memory.

For day_of_the_week, we created an Enum type called week because there are only seven possible values.

The date was represented as the DATE datatype. lat and long both have 8 digits after decimal and two before so I chose Decimal(10,8)

In [ ]:
# First create the Enum datatype for day_of_the_week: only seven possible
# values, so an enumerated type is compact and self-validating

qu_enum = """
    CREATE TYPE week AS ENUM (
    'Friday', 'Monday', 'Saturday', 
    'Sunday', 'Thursday', 'Tuesday', 
    'Wednesday'
)"""

cur.execute(qu_enum)
conn.commit()
In [ ]:
# Create the table inside the crimes schema.
# NOTE(review): COPY matches CSV columns to table columns by position, so
# latitude/longitude need not match the lat/long CSV headers.

qu_table = """
    CREATE TABLE crimes.boston_crimes (
    incident_number numeric PRIMARY KEY,
    offense_code smallint,
    description VARCHAR(100),
    date date,
    day_of_the_week week,
    latitude decimal(10,8),
    longitude decimal(10,8)      
);"""

cur.execute(qu_table)
conn.commit()

Load the csv file into the table¶

I used copy_expert method to load the data which is fast and easy to use.

In [ ]:
# Load the data from boston.csv into boston_crimes table in the crimes schema.
# copy_expert streams the file through COPY ... FROM STDIN; HEADER skips row 1.

with open('boston.csv', 'r') as f:
    cur.copy_expert("COPY crimes.boston_crimes FROM STDIN WITH CSV HEADER;", f)
In [25]:
# print number of rows to ensure that they were loaded 
# Number of rows in the file excluding header = 298329

# Count on the server side instead of fetching every row into client memory
# just to take len() of the result.
cur.execute("SELECT COUNT(*) FROM crimes.boston_crimes;")
print(cur.fetchone()[0])
298329

Creating groups and handling privileges¶

  • To avoid privilege inheritance from the public group, we revoke all privileges from it

  • Next, we create groups with suitable privileges and add users to groups, so we don't need to grant or revoke privileges for each user individually. (Users inherit privileges from their groups)

  • Lastly, create users and assign to the groups with privileges suitable for the user

Revoke public privileges¶

We revoke all privileges of the public group on the public schema to ensure that users will not inherit privileges on that schema, such as the ability to create tables in the public schema.

We also need to revoke all privileges in the newly created schema. Doing this also makes it so that we do not need to revoke the privileges when we create users and groups because unless specified otherwise, privileges are not granted by default.

In [ ]:
# Revoke the default privileges of the implicit "public" role on the public
# schema and on the database, so users inherit nothing by default
cur.execute("REVOKE ALL ON SCHEMA public FROM public;")
cur.execute("REVOKE ALL ON DATABASE crimes_db FROM public;")

conn.commit()

Create Readonly Group¶

In [ ]:
# Create the readonly group (a role without login)
cur.execute("CREATE GROUP readonly NOLOGIN;")

# Grant Connection privilege so members can connect to the DB
cur.execute("GRANT CONNECT ON DATABASE crimes_db TO readonly;")

# Grant Usage privilege for accessing objects in the crimes schema
cur.execute('GRANT USAGE ON SCHEMA crimes TO readonly;')

# Only SELECT for readonly
cur.execute('GRANT SELECT ON ALL TABLES IN SCHEMA crimes TO readonly;')

conn.commit()

Create Readwrite Group¶

In [ ]:
# Create the readwrite group: same CONNECT/USAGE as readonly, plus full
# DML (INSERT/SELECT/DELETE/UPDATE) on tables in the crimes schema
cur.execute("CREATE GROUP readwrite  NOLOGIN;")
cur.execute("GRANT CONNECT ON DATABASE crimes_db TO readwrite;")
cur.execute('GRANT USAGE ON SCHEMA crimes TO readwrite;')
cur.execute('GRANT INSERT, SELECT, DELETE, UPDATE ON ALL TABLES IN SCHEMA crimes TO readwrite;')

conn.commit()

Create one user for each of the groups¶

I created data_analyst with a password and assigned it to readonly group.

I created data_scientist with a password and assigned it to the readwrite group.

In [ ]:
# CREATE Users and assign to appropriate groups.
# Role membership is granted as GRANT <group> TO <user>; the original
# statements had the operands reversed (GRANT <user> TO <group>), which
# would have made each group a member of its user instead of the user
# inheriting the group's privileges.
cur.execute("CREATE USER data_analyst WITH PASSWORD 'secret1';")
cur.execute("GRANT readonly TO data_analyst;")

cur.execute("CREATE USER data_scientist WITH PASSWORD 'secret2';")
cur.execute("GRANT readwrite TO data_scientist;")

conn.commit()

Test Database Setup¶

Test the DB setup using SQL queries on the pg_user, pg_roles and information_schema.table_privileges tables.

In the information_schema.table_privileges table I will check privileges related to the type of SQL queries allowed on the table. I will list each group name and the SQL queries that users in that group are allowed to make.

In [27]:
# Helper: fetch every row from an executed cursor, echo each one, and
# hand the full result list back to the caller
def fetch_all(cursor):
    """Print each fetched row on its own line, then a blank separator,
    and return the list of rows."""
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    print('\n')
    return rows

Here are description of each value in the result of query on pg_user table

pg_user documentation

  • From Left to right:
    • usename
    • ID of this user
    • User can create databases
    • User is a superuser
    • User can initiate streaming replication
    • User bypasses every row-level security policy
    • password
    • Password expiry time
    • Session defaults for run-time configuration variables
In [28]:
# Let's Check the users and privileges from "pg_user" and "information_schema.table_privileges"
cur.execute("""
    SELECT * 
    FROM pg_user
    WHERE usename IN ('data_analyst', 'data_scientist');
""")
users = fetch_all(cur)

# Privileges related to the readonly and readwrite groups
cur.execute('''SELECT grantor, grantee, privilege_type 
                FROM information_schema.table_privileges 
                WHERE grantee IN ('readonly', 'readwrite');''')
privil = fetch_all(cur)
('data_analyst', 16413, False, False, False, False, '********', None, None)
('data_scientist', 16414, False, False, False, False, '********', None, None)


('dq', 'readwrite', 'INSERT')
('dq', 'readwrite', 'SELECT')
('dq', 'readwrite', 'UPDATE')
('dq', 'readwrite', 'DELETE')
('dq', 'readonly', 'SELECT')


Here are description of what each value means in the result from pg_roles table

pg_roles documentation

  • From Left to right:
    • Role name
    • Role has superuser privileges
    • Role automatically inherits privileges of roles it is a member of
    • Role can create more roles
    • Role can create databases
    • Role can log in
    • ...
    • ID of role
In [29]:
# Here I check "pg_roles" (It shows both users and groups)
cur.execute("""
    SELECT * 
    FROM pg_roles
    WHERE rolname IN ('readonly', 'readwrite', 'data_analyst', 'data_scientist');
""")

pg_roles = fetch_all(cur)

# Privileges related to the boston_crimes table specifically
cur.execute('''SELECT grantor, grantee, privilege_type 
                FROM information_schema.table_privileges 
                WHERE table_name = 'boston_crimes';''')
privil_table = fetch_all(cur)
('readonly', False, True, False, False, False, False, False, -1, '********', None, None, 16411)
('readwrite', False, True, False, False, False, False, False, -1, '********', None, None, 16412)
('data_analyst', False, True, False, False, False, True, False, -1, '********', None, None, 16413)
('data_scientist', False, True, False, False, False, True, False, -1, '********', None, None, 16414)


('dq', 'dq', 'INSERT')
('dq', 'dq', 'SELECT')
('dq', 'dq', 'UPDATE')
('dq', 'dq', 'DELETE')
('dq', 'dq', 'TRUNCATE')
('dq', 'dq', 'REFERENCES')
('dq', 'dq', 'TRIGGER')
('dq', 'readwrite', 'INSERT')
('dq', 'readwrite', 'SELECT')
('dq', 'readwrite', 'UPDATE')
('dq', 'readwrite', 'DELETE')
('dq', 'readonly', 'SELECT')