Crime Report DB in Postgresql¶

Objectives:

  • Create a database crimes_db with a table boston_crimes with appropriate data types
  • Create a schema and create the table inside it
  • Load the data from boston.csv into the table
  • Create readonly and readwrite groups with appropriate privileges
  • Create users for each of the groups

Create DB and Schema¶

In [ ]:
# Read user_name and password from environment variables loaded from a .env file
from dotenv import load_dotenv
import os

# Populate os.environ from the .env file in the working directory
load_dotenv()

# Credentials used by every psycopg2 connection below
username = os.environ.get('user_name') 
pwd = os.environ.get('password')
In [1]:
# Import library and create the DB and close the connection
import psycopg2

connection_str = "user={} password={} host=127.0.0.1 port=5432 dbname=DB".format(username,pwd) 

conn = psycopg2.connect(connection_str)
cur = conn.cursor()
# set autocommit to True bacause this is required for creating databases
# CREATE DATABASE can only be executed as a single
conn.autocommit = True 

cur.execute("CREATE DATABASE crimes_db;")
conn.close()
In [2]:
# Reconnect, this time to the newly created crimes_db database
connection_str = "user={} password={} host=127.0.0.1 port=5432 dbname=crimes_db".format(username,pwd) 
conn = psycopg2.connect(connection_str)
cur = conn.cursor()

# Create a Schema in DB to hold the boston_crimes table
cur.execute("CREATE SCHEMA crimes;")
In [3]:
# Let's Check Schema and DBs
# Sanity check: list every schema and confirm which database we are connected to
qu_schema = """SELECT schema_name
                FROM information_schema.schemata;
            """
cur.execute(qu_schema)
schema_name = cur.fetchall()

# current_database() reports the database of this session
cur.execute("SELECT current_database();")
dbname = cur.fetchall()

print("Schema name  = {} \n DB name = {}".format(schema_name, dbname))
Schema name  = [('pg_toast',), ('pg_temp_1',), ('pg_toast_temp_1',), ('pg_catalog',), ('information_schema',), ('public',), ('crimes',)] 
 DB name = [('crimes_db',)]

Obtain Column Names and Sample¶

In [24]:
# Read boston.csv once: keep the raw lines, parse the header and the
# first data row, and report the data-row count.
import csv

with open('boston.csv', 'r') as f:
    rows = list(f)  # every raw line, header included

# Parse the header and first data row from the lines already in memory
# (csv.reader accepts any iterable of lines) instead of re-opening the file.
reader = csv.reader(rows)
col_headers = next(reader)
first_row = next(reader)

# Preview the first three raw lines (header + two data rows)
for idx, row in enumerate(rows[:3], start=1):
    print(idx, row, sep='\t')

print("Number of rows in the file excluding header = {}".format(len(rows)-1))
1	incident_number,offense_code,description,date,day_of_the_week,lat,long

2	1,619,LARCENY ALL OTHERS,2018-09-02,Sunday,42.35779134,-71.13937053

3	2,1402,VANDALISM,2018-08-21,Tuesday,42.30682138,-71.06030035

Number of rows in the file excluding header = 298329

Helper function to analyze column values¶

Create a function get_col_set that returns a set of all distinct values of a column by passing in CSV file name and column index

Helps to see if the column is categorical with a limited set of values (to use an enumerated datatype) and to check the maximum number of characters in a column

In [5]:
def get_col_set(csv_file, col_idx):
    """Return the set of distinct values in one column of a CSV file.

    Args:
        csv_file: path to the CSV file (first line is assumed to be a header).
        col_idx: zero-based index of the column to collect.

    Returns:
        A set of the distinct string values found in that column,
        header excluded. Empty set for a file with no data rows.
    """
    import csv

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        # Skip the header via the csv reader (not the raw file handle) so a
        # quoted, multi-line header cannot desynchronize the parser; the
        # default of None makes an empty file yield an empty set.
        next(reader, None)
        return {row[col_idx] for row in reader}
In [6]:
# Distinct-value count per column: a small count suggests a categorical
# column (candidate for an enumerated type)
for col, header in enumerate(col_headers):
    print(header, len(get_col_set("boston.csv", col)), sep='\t')
incident_number	298329
offense_code	219
description	239
date	1177
day_of_the_week	7
lat	18177
long	18177

Analyzing the maximum length or value range¶

  • description and day_of_the_week are textual columns
  • Longest text in day_of_the_week must be Wednesday but we need to compute that for the other column
In [16]:
# Echo the column headers (the notebook displays the last expression)
col_headers
Out[16]:
['incident_number',
 'offense_code',
 'description',
 'date',
 'day_of_the_week',
 'lat',
 'long']
In [7]:
# Max characters
desc_set = get_col_set("boston.csv", 2)

max_len = max(desc_set, key=len)
len(max_len)
Out[7]:
58
  • Lat and Long columns have typically 8 digits after decimal and 2 before
In [14]:
# Max Precision and Scale 
long_set = get_col_set("boston.csv", 6)

# Longest longitude string: shows the digits before/after the decimal point
max_len = max(long_set, key=len)
max_len
Out[14]:
'-71.14678737'
In [9]:
# Same check for latitude: longest string shows the precision/scale needed
lat_set = get_col_set("boston.csv", 5)
max_dig = max(lat_set, key=len)
max_dig
Out[9]:
'42.29297429'
  • Check the maximum value in offense_code and distribution to decide whether smallint is big enough for this column
In [19]:
# Max value and distribution of offense_code: smallint holds up to 32767,
# so it is sufficient if the maximum code stays below that.
with open("boston.csv", 'r') as f:
    next(f)  # skip the header line
    reader = csv.reader(f)
    # collect the codes with a comprehension instead of a manual append loop
    col = [int(row[1]) for row in reader]

max(col)
Out[19]:
3831
In [20]:
# Histogram of offense_code values to see how they are distributed
# (%matplotlib inline is an IPython magic to render plots in the notebook)
import pandas as pd
%matplotlib inline

pd.Series(col).hist()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe9dda6a58>

Creating the table¶

I will use the same column name in the DB.

Table Datatypes:

incident_number could contain big numbers as the table grows, so numeric is useful as it has a variable byte size. It is also set as the primary key.

offense_code are positive integer with 4 digits maximum, so smallint with 2 byte would be suitable.

description is textual and maximum 58 characters so varchar(n) with size 100 so we leave some margin while not wasting extra memory.

For day_of_the_week, we created an Enum type called week because there are only seven possible values.

The date was represented as the DATE datatype. lat and long both have 8 digits after decimal and two before so I chose Decimal(10,8)

In [ ]:
# First create the Enum datatype for day_of_the_week: only seven possible
# values, so an enumerated type is compact and self-validating

qu_enum = """
    CREATE TYPE week AS ENUM (
    'Friday', 'Monday', 'Saturday', 
    'Sunday', 'Thursday', 'Tuesday', 
    'Wednesday'
)"""

cur.execute(qu_enum)
conn.commit()
In [ ]:
# Create the table inside the crimes schema.
# NOTE(review): COPY matches CSV columns to table columns by position, so
# latitude/longitude need not match the lat/long CSV headers.

qu_table = """
    CREATE TABLE crimes.boston_crimes (
    incident_number numeric PRIMARY KEY,
    offense_code smallint,
    description VARCHAR(100),
    date date,
    day_of_the_week week,
    latitude decimal(10,8),
    longitude decimal(10,8)      
);"""

cur.execute(qu_table)
conn.commit()

Load the csv file into the table¶

I used copy_expert method to load the data which is fast and easy to use.

In [ ]:
# Load the data from boston.csv into boston_crimes table in the crimes schema.
# copy_expert streams the file through COPY ... FROM STDIN; HEADER skips row 1.

with open('boston.csv', 'r') as f:
    cur.copy_expert("COPY crimes.boston_crimes FROM STDIN WITH CSV HEADER;", f)
In [25]:
# print number of rows to ensure that they were loaded 
# Number of rows in the file excluding header = 298329

# Count on the server side instead of fetching every row into client memory
# just to take len() of the result.
cur.execute("SELECT COUNT(*) FROM crimes.boston_crimes;")
print(cur.fetchone()[0])
298329

Creating groups and handling privileges¶

  • To avoid privilege inheritance from the public group, we revoke all privileges from it

  • Next, we create groups with suitable privileges and add users to groups, so we don't need to grant or revoke privileges for each user individually. (Users inherit privileges from their groups)

  • Lastly, create users and assign to the groups with privileges suitable for the user

Revoke public privileges¶

We revoke all privileges of the public group on the public schema to ensure that users will not inherit privileges on that schema, such as the ability to create tables in the public schema.

We also need to revoke all privileges in the newly created schema. Doing this also makes it so that we do not need to revoke the privileges when we create users and groups because unless specified otherwise, privileges are not granted by default.

In [ ]:
# Revoke the default privileges of the implicit "public" role on the public
# schema and on the database, so users inherit nothing by default
cur.execute("REVOKE ALL ON SCHEMA public FROM public;")
cur.execute("REVOKE ALL ON DATABASE crimes_db FROM public;")

conn.commit()

Create Readonly Group¶

In [ ]:
# Create the readonly group (a role without login)
cur.execute("CREATE GROUP readonly NOLOGIN;")

# Grant Connection privilege so members can connect to the DB
cur.execute("GRANT CONNECT ON DATABASE crimes_db TO readonly;")

# Grant Usage privilege for accessing objects in the crimes schema
cur.execute('GRANT USAGE ON SCHEMA crimes TO readonly;')

# Only SELECT for readonly
cur.execute('GRANT SELECT ON ALL TABLES IN SCHEMA crimes TO readonly;')

conn.commit()

Create Readwrite Group¶

In [ ]:
# Create the readwrite group: same CONNECT/USAGE as readonly, plus full
# DML (INSERT/SELECT/DELETE/UPDATE) on tables in the crimes schema
cur.execute("CREATE GROUP readwrite  NOLOGIN;")
cur.execute("GRANT CONNECT ON DATABASE crimes_db TO readwrite;")
cur.execute('GRANT USAGE ON SCHEMA crimes TO readwrite;')
cur.execute('GRANT INSERT, SELECT, DELETE, UPDATE ON ALL TABLES IN SCHEMA crimes TO readwrite;')

conn.commit()

Create one user for each of the groups¶

I created data_analyst with a password and assigned it to readonly group.

I created data_scientist with a password and assigned it to the readwrite group.

In [ ]:
# CREATE Users and assign to appropriate groups.
# Role membership is granted as GRANT <group> TO <user>; the original
# statements had the operands reversed (GRANT <user> TO <group>), which
# would have made each group a member of its user instead of the user
# inheriting the group's privileges.
cur.execute("CREATE USER data_analyst WITH PASSWORD 'secret1';")
cur.execute("GRANT readonly TO data_analyst;")

cur.execute("CREATE USER data_scientist WITH PASSWORD 'secret2';")
cur.execute("GRANT readwrite TO data_scientist;")

conn.commit()

Test Database Setup¶

Test the DB setup using SQL queries on the pg_user, pg_roles and information_schema.table_privileges tables.

In the information_schema.table_privileges table I will check privileges related to the type of SQL queries allowed on the table. I will list each group name and the SQL queries that users in that group are allowed to make.

In [27]:
# Helper: fetch every row from an executed cursor, echo each one, and
# hand the full result list back to the caller
def fetch_all(cursor):
    """Print each fetched row on its own line, then a blank separator,
    and return the list of rows."""
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    print('\n')
    return rows

Here are description of each value in the result of query on pg_user table

pg_user documentation

  • From Left to right:
    • usename
    • ID of this user
    • User can create databases
    • User is a superuser
    • User can initiate streaming replication
    • User bypasses every row-level security policy
    • password
    • Password expiry time
    • Session defaults for run-time configuration variables
In [28]:
# Let's Check the users and privileges from "pg_user" and "information_schema.table_privileges"
cur.execute("""
    SELECT * 
    FROM pg_user
    WHERE usename IN ('data_analyst', 'data_scientist');
""")
users = fetch_all(cur)

# Privileges related to the readonly and readwrite groups
cur.execute('''SELECT grantor, grantee, privilege_type 
                FROM information_schema.table_privileges 
                WHERE grantee IN ('readonly', 'readwrite');''')
privil = fetch_all(cur)
('data_analyst', 16413, False, False, False, False, '********', None, None)
('data_scientist', 16414, False, False, False, False, '********', None, None)


('dq', 'readwrite', 'INSERT')
('dq', 'readwrite', 'SELECT')
('dq', 'readwrite', 'UPDATE')
('dq', 'readwrite', 'DELETE')
('dq', 'readonly', 'SELECT')


Here are description of what each value means in the result from pg_roles table

pg_roles documentation

  • From Left to right:
    • Role name
    • Role has superuser privileges
    • Role automatically inherits privileges of roles it is a member of
    • Role can create more roles
    • Role can create databases
    • Role can log in
    • ...
    • ID of role
In [29]:
# Here I check "pg_roles" (It shows both users and groups)
cur.execute("""
    SELECT * 
    FROM pg_roles
    WHERE rolname IN ('readonly', 'readwrite', 'data_analyst', 'data_scientist');
""")

pg_roles = fetch_all(cur)

# Privileges related to the boston_crimes table specifically
cur.execute('''SELECT grantor, grantee, privilege_type 
                FROM information_schema.table_privileges 
                WHERE table_name = 'boston_crimes';''')
privil_table = fetch_all(cur)
('readonly', False, True, False, False, False, False, False, -1, '********', None, None, 16411)
('readwrite', False, True, False, False, False, False, False, -1, '********', None, None, 16412)
('data_analyst', False, True, False, False, False, True, False, -1, '********', None, None, 16413)
('data_scientist', False, True, False, False, False, True, False, -1, '********', None, None, 16414)


('dq', 'dq', 'INSERT')
('dq', 'dq', 'SELECT')
('dq', 'dq', 'UPDATE')
('dq', 'dq', 'DELETE')
('dq', 'dq', 'TRUNCATE')
('dq', 'dq', 'REFERENCES')
('dq', 'dq', 'TRIGGER')
('dq', 'readwrite', 'INSERT')
('dq', 'readwrite', 'SELECT')
('dq', 'readwrite', 'UPDATE')
('dq', 'readwrite', 'DELETE')
('dq', 'readonly', 'SELECT')