Objectives:
Create a database crimes_db
with a table boston_crimes
with appropriate data types for storing the data from boston.csv
Load the data from boston.csv into the table
Create readonly
and readwrite
groups with appropriate privileges
# Read user_name and password from Environment Variables in .env file
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file, then read the
# database credentials out of it (python-dotenv is a third-party package).
load_dotenv()
# NOTE(review): both values are None when the .env keys 'user_name' and
# 'password' are absent — confirm the .env file defines them, otherwise the
# later psycopg2.connect() fails with a less obvious error.
username = os.environ.get('user_name')
pwd = os.environ.get('password')
# Connect, create the crimes_db database, then reconnect straight to it.
import psycopg2

# NOTE(review): assumes a database literally named "DB" already exists to
# connect to before crimes_db is created — confirm.
dsn = f"user={username} password={pwd} host=127.0.0.1 port=5432 dbname=DB"
conn = psycopg2.connect(dsn)
cur = conn.cursor()
# CREATE DATABASE cannot run inside a transaction block, so autocommit has
# to be switched on before issuing it.
conn.autocommit = True
cur.execute("CREATE DATABASE crimes_db;")
conn.close()

# Fresh connection, this time directly to the newly created crimes_db.
dsn = f"user={username} password={pwd} host=127.0.0.1 port=5432 dbname=crimes_db"
conn = psycopg2.connect(dsn)
cur = conn.cursor()
# Add a dedicated schema inside crimes_db for the crimes table.
cur.execute("CREATE SCHEMA crimes;")

# Sanity-check the setup: list every schema plus the current database name.
schema_query = """SELECT schema_name
FROM information_schema.schemata;
"""
cur.execute(schema_query)
schema_rows = cur.fetchall()
cur.execute("SELECT current_database();")
db_rows = cur.fetchall()
print("Schema name = {} \n DB name = {}".format(schema_rows, db_rows))
Schema name = [('pg_toast',), ('pg_temp_1',), ('pg_toast_temp_1',), ('pg_catalog',), ('information_schema',), ('public',), ('crimes',)] DB name = [('crimes_db',)]
import csv

# Read boston.csv a single time (the original opened and scanned the file
# twice): keep the raw lines for previewing/counting, and parse the first
# two of them for the column headers and a sample data row.
with open('boston.csv', 'r') as f:
    rows = list(f)
reader = csv.reader(rows[:2])
col_headers = next(reader)   # header line
first_row = next(reader)     # first data row, for eyeballing the types
# Preview the first three raw lines (header included).
for idx, row in enumerate(rows[:3], start=1):
    print(idx, row, sep='\t')
print("Number of rows in the file excluding header = {}".format(len(rows)-1))
1 incident_number,offense_code,description,date,day_of_the_week,lat,long 2 1,619,LARCENY ALL OTHERS,2018-09-02,Sunday,42.35779134,-71.13937053 3 2,1402,VANDALISM,2018-08-21,Tuesday,42.30682138,-71.06030035 Number of rows in the file = 298330
Create a function get_col_set
that returns a set of all distinct values of a column by passing in the CSV file name and a column index.
It helps to see if the column is categorical with a limited set of values (to use an Enumerated datatype) and to check the maximum length of characters in a column.
def get_col_set(csv_file, col_idx):
    """Return the set of distinct values in column ``col_idx`` of ``csv_file``.

    The first line (the header) is skipped.  Useful for spotting categorical
    columns and for measuring the maximum value length in a column.
    """
    import csv
    with open(csv_file, 'r') as f:
        next(f)  # skip the header line
        return {record[col_idx] for record in csv.reader(f)}
# Distinct-value count per column: a small count suggests a categorical
# column that could use an enumerated type.
for idx, header in enumerate(col_headers):
    distinct = get_col_set("boston.csv", idx)
    print(header, len(distinct), sep='\t')
incident_number 298329 offense_code 219 description 239 date 1177 day_of_the_week 7 lat 18177 long 18177
description
and day_of_the_week
are textual columns. The longest day_of_the_week
value must be Wednesday, but we need to compute
the maximum length for the other column. col_headers
['incident_number', 'offense_code', 'description', 'date', 'day_of_the_week', 'lat', 'long']
# Longest description value -> informs the varchar size for that column.
desc_values = get_col_set("boston.csv", 2)
longest_desc = max(desc_values, key=len)
len(longest_desc)
58
# Widest longitude value -> informs the decimal precision and scale.
long_values = get_col_set("boston.csv", 6)
widest_long = max(long_values, key=len)
widest_long
'-71.14678737'
# Widest latitude value, for the same purpose.
lat_values = get_col_set("boston.csv", 5)
widest_lat = max(lat_values, key=len)
widest_lat
'42.29297429'
offense_code
is checked for its maximum value and distribution to decide whether smallint
is big enough for this column.
# Max value and distribution
# Collect every offense_code as an int so we can check the maximum value
# and its distribution.
with open("boston.csv", 'r') as f:
    next(f)  # skip the header line
    reader = csv.reader(f)
    col = [int(record[1]) for record in reader]
max(col)
3831
# Plot a histogram of the offense codes to eyeball their distribution.
import pandas as pd
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
pd.Series(col).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7efe9dda6a58>
I will use the same column names in the DB, except lat and long, which become latitude and longitude.
Table Datatypes:
incident_number
could contain big numbers as the table grows, so numeric
is useful as it has a variable byte size. It is also set as the primary key.
offense_code
values are positive integers with 4 digits maximum, so smallint
with 2 bytes would be suitable.
description
is textual with a maximum of 58 characters, so varchar(n)
with size 100 leaves some margin while not wasting extra memory.
For day_of_the_week
, we created an Enum type called week
because there are only seven possible values.
The date was represented as the DATE
datatype. lat
and long
both have 8 digits after the decimal point and two before, so I chose Decimal(10,8).
# day_of_the_week has exactly seven possible values, so an enumerated type
# is a natural fit — create it before the table that references it.
cur.execute("""
CREATE TYPE week AS ENUM (
'Friday', 'Monday', 'Saturday',
'Sunday', 'Thursday', 'Tuesday',
'Wednesday'
)""")
conn.commit()

# Create the crimes table with the types sized from the profiling above.
cur.execute("""
CREATE TABLE crimes.boston_crimes (
incident_number numeric PRIMARY KEY,
offense_code smallint,
description VARCHAR(100),
date date,
day_of_the_week week,
latitude decimal(10,8),
longitude decimal(10,8)
);""")
conn.commit()
I used copy_expert
method to load the data which is fast and easy to use.
# Bulk-load boston.csv into crimes.boston_crimes; copy_expert streams the
# whole file through COPY, which is far faster than row-by-row INSERTs.
with open('boston.csv', 'r') as f:
    cur.copy_expert("COPY crimes.boston_crimes FROM STDIN WITH CSV HEADER;", f)
# BUG FIX: the original never committed after the load and relied on a much
# later commit to persist it — commit explicitly here.
conn.commit()
# Verify the row count matches the file (298329 data rows).  Count on the
# server instead of fetching all ~298k rows just to len() them.
cur.execute("SELECT COUNT(*) FROM crimes.boston_crimes;")
print(cur.fetchone()[0])
298329
To avoid privilege inheritance from the public group, we revoke all privileges from it.
Next, we create groups with suitable privileges and add users to the groups; this way we don't need to permit or revoke for each user individually. (Users inherit privileges from their groups.)
Lastly, create users and assign them to the groups with privileges suitable for each user.
We revoke all privileges of the public public group on the public schema to ensure that users will not inherit privileges on that schema such as the ability to create tables in the public schema.
We also need to revoke all privileges in the newly created schema. Doing this also makes it so that we do not need to revoke the privileges when we create users and groups because unless specified otherwise, privileges are not granted by default.
# Strip the default privileges of the implicit "public" role so nothing is
# inherited accidentally (e.g. the ability to create tables in public).
cur.execute("REVOKE ALL ON SCHEMA public FROM public;")
cur.execute("REVOKE ALL ON DATABASE crimes_db FROM public;")
conn.commit()

# readonly group: may connect, use the crimes schema, and SELECT only.
for stmt in (
    "CREATE GROUP readonly NOLOGIN;",
    "GRANT CONNECT ON DATABASE crimes_db TO readonly;",
    'GRANT USAGE ON SCHEMA crimes TO readonly;',
    'GRANT SELECT ON ALL TABLES IN SCHEMA crimes TO readonly;',
):
    cur.execute(stmt)
conn.commit()

# readwrite group: the same access plus INSERT/DELETE/UPDATE.
for stmt in (
    "CREATE GROUP readwrite NOLOGIN;",
    "GRANT CONNECT ON DATABASE crimes_db TO readwrite;",
    'GRANT USAGE ON SCHEMA crimes TO readwrite;',
    'GRANT INSERT, SELECT, DELETE, UPDATE ON ALL TABLES IN SCHEMA crimes TO readwrite;',
):
    cur.execute(stmt)
conn.commit()
I created data_analyst
with a password and assigned it to the readonly
group.
I created data_scientist
with a password and assigned it to the readwrite
group.
# Create the users and add each one to its group.
# BUG FIX: the original ran "GRANT data_analyst TO readonly;", which makes
# the readonly GROUP a member of the data_analyst role — the opposite of
# the intent.  Role membership is granted as GRANT <group> TO <user>.
cur.execute("CREATE USER data_analyst WITH PASSWORD 'secret1';")
cur.execute("GRANT readonly TO data_analyst;")
cur.execute("CREATE USER data_scientist WITH PASSWORD 'secret2';")
cur.execute("GRANT readwrite TO data_scientist;")
conn.commit()
Test the DB setup using SQL queries on the pg_user
, pg_roles
and information_schema.table_privileges
tables.
In the information_schema.table_privileges
table I will check privileges related to the type of SQL queries allowed on the table. I will list the group name and the SQL queries users in each group are allowed to make.
# Helper that prints query results one row per line for readability.
def fetch_all(cursor):
    """Fetch every row from an executed cursor, print each, and return them."""
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    print('\n')
    return rows
Here is a description of each value in the result of the query on the pg_user
table:
pg_user documentation
# Inspect the new users and the group privileges via "pg_user" and
# "information_schema.table_privileges".
user_query = """
SELECT *
FROM pg_user
WHERE usename IN ('data_analyst', 'data_scientist');
"""
cur.execute(user_query)
users = fetch_all(cur)

# Table privileges related to the readonly and readwrite groups.
group_priv_query = '''SELECT grantor, grantee, privilege_type
FROM information_schema.table_privileges
WHERE grantee IN ('readonly', 'readwrite');'''
cur.execute(group_priv_query)
privil = fetch_all(cur)
('data_analyst', 16413, False, False, False, False, '********', None, None) ('data_scientist', 16414, False, False, False, False, '********', None, None) ('dq', 'readwrite', 'INSERT') ('dq', 'readwrite', 'SELECT') ('dq', 'readwrite', 'UPDATE') ('dq', 'readwrite', 'DELETE') ('dq', 'readonly', 'SELECT')
Here is a description of what each value means in the result from the pg_roles
table:
pg_roles documentation
# "pg_roles" lists both users and groups, so check all four roles at once.
role_query = """
SELECT *
FROM pg_roles
WHERE rolname IN ('readonly', 'readwrite', 'data_analyst', 'data_scientist');
"""
cur.execute(role_query)
pg_roles = fetch_all(cur)

# Table privileges related specifically to the boston_crimes table.
table_priv_query = '''SELECT grantor, grantee, privilege_type
FROM information_schema.table_privileges
WHERE table_name = 'boston_crimes';'''
cur.execute(table_priv_query)
privil_table = fetch_all(cur)
('readonly', False, True, False, False, False, False, False, -1, '********', None, None, 16411) ('readwrite', False, True, False, False, False, False, False, -1, '********', None, None, 16412) ('data_analyst', False, True, False, False, False, True, False, -1, '********', None, None, 16413) ('data_scientist', False, True, False, False, False, True, False, -1, '********', None, None, 16414) ('dq', 'dq', 'INSERT') ('dq', 'dq', 'SELECT') ('dq', 'dq', 'UPDATE') ('dq', 'dq', 'DELETE') ('dq', 'dq', 'TRUNCATE') ('dq', 'dq', 'REFERENCES') ('dq', 'dq', 'TRIGGER') ('dq', 'readwrite', 'INSERT') ('dq', 'readwrite', 'SELECT') ('dq', 'readwrite', 'UPDATE') ('dq', 'readwrite', 'DELETE') ('dq', 'readonly', 'SELECT')