Module `pyISARICBasics.functions`

Expand source code

import gc
import os
import sqlite3
import pandas as pd
import numpy as np

ALL_DOMAINS = {"DM", "DS", "ER", "HO", "IE", "IN", "LB", "MB", "RP", "RS", "SA", "SV", "VS", "CQ", "SC", "PO", "TI"}
"""
A set containing all possible domains in the ISARIC dataset. 
"""


### TODO write custom .hdf5 loader and saver functions for QUICKEST I/O
### This could be tricky -> it seems like the best option is to convert dtypes from object to pandas d types
### but then we need to handle stuff differently in other places in code...
### We also need to try and deal with the fact that it seems like df.to_hdf() freaks out with DF's over a certain size
### You can load and append using hdf but I am not sure how yet -> you can also query using hdf this is definitely worth
### looking into though.

def csv_to_sqlite(data_folder, db_file, overwrite=True):
    """
    Converts all raw .csv files to a sqlite database

    :param data_folder: Location of folder where data is contained

    :param db_file: Name of sqlite databse within data_folder

    :param overwrite: Rewrite sqlite database if it already exists

    :return: Null
    """
    # if os.path.isfile(os.path.join(data_folder, db_file)):
    #     print("Database already exists")
    #     return
    for file in os.listdir(data_folder):  # get all files in data_folder
        if file.endswith(".csv"):  # get csv files
            name = os.path.splitext(file)[0]  # file name with no extension
            print(name)
            name = parse_domain_names(name)
            name = name.replace('-', '_')  # replace - with _ to avoid sql errors.
            print("_" * 150)
            print("Creating table:", name)
            file_path = os.path.join(data_folder, file)

            df = pd.read_csv(file_path, on_bad_lines='skip', verbose=False)
            df = df.rename(columns=lambda x: x.strip())
            # print(df.dtypes)
            # df = df.convert_dtypes()
            #
            # print(df.dtypes)
            print("Length of df ", name, len(df))
            df_to_sqlite(df, name, data_folder, db_file, overwrite)
            del df
            gc.collect()


def df_to_sqlite(df, table_name, data_folder, data_file, overwrite=True):
    """
    Creates a table in sqlite database using the supplied dataframe, also saves .pickle files for each table saved to
    the sql database (these are much quicker to load in to memory using Python and Pandas).

    :param df: Dataframe to convert to sqlite table

    :param table_name: Table name of dataframe (to be saved as in database)

    :param data_folder: Location of folder where data is contained

    :param data_file: db_file: Name of sqlite database within data_folder

    :param overwrite: overwrite: Rewrite sqlite database if it already exists

    :return: True, if write successful
    """
    # Executes a query and returns a pandas dataframe
    # query is a string.
    # data_folder is the folder that sqlite file is in.
    # data_file is the name of sqlite file.
    if overwrite:
        if_exists = 'replace'
    else:
        if_exists = None
    try:
        db_file = os.path.join(data_folder, data_file)
        con = sqlite3.connect(db_file)  # connection to the database file
        df.to_sql(table_name, con, if_exists=if_exists, index=False)
        save_string = os.path.join(data_folder, f"{table_name}.pickle")
        df.to_pickle(save_string)
        del df
        gc.collect()
        return True
    except ValueError as e:
        print("Table already exists in database, set Overwrite = True if you wish to overwrite existing table.")
        return None
    finally:
        con.close()


def parse_domain_names(name: str) -> str:
    """
    Function to read domain name and return two letter abbreviation for each domain

    :param name: (str) filename similar to "Partner_DM_2021-09-20.csv"

    :return: (str) Two letter abbreviated domain name e.g. "DM" for above filename
    """
    domain = [i for i in name.split("_") if i in ALL_DOMAINS]
    if len(domain) > 1:
        print("Couldn't parse domain name from filename returning full path as domain name")
        return name
    else:
        return domain.pop()

Global variables

var ALL_DOMAINS: A set containing all possible domains in the ISARIC dataset.

Functions

def csv_to_sqlite(data_folder, db_file, overwrite=True)

Converts all raw .csv files to a sqlite database

:param data_folder: Location of folder where data is contained

:param db_file: Name of sqlite databse within data_folder

:param overwrite: Rewrite sqlite database if it already exists

:return: Null

Expand source code

def csv_to_sqlite(data_folder, db_file, overwrite=True):
    """
    Converts all raw .csv files to a sqlite database

    :param data_folder: Location of folder where data is contained

    :param db_file: Name of sqlite databse within data_folder

    :param overwrite: Rewrite sqlite database if it already exists

    :return: Null
    """
    # if os.path.isfile(os.path.join(data_folder, db_file)):
    #     print("Database already exists")
    #     return
    for file in os.listdir(data_folder):  # get all files in data_folder
        if file.endswith(".csv"):  # get csv files
            name = os.path.splitext(file)[0]  # file name with no extension
            print(name)
            name = parse_domain_names(name)
            name = name.replace('-', '_')  # replace - with _ to avoid sql errors.
            print("_" * 150)
            print("Creating table:", name)
            file_path = os.path.join(data_folder, file)

            df = pd.read_csv(file_path, on_bad_lines='skip', verbose=False)
            df = df.rename(columns=lambda x: x.strip())
            # print(df.dtypes)
            # df = df.convert_dtypes()
            #
            # print(df.dtypes)
            print("Length of df ", name, len(df))
            df_to_sqlite(df, name, data_folder, db_file, overwrite)
            del df
            gc.collect()

def df_to_sqlite(df, table_name, data_folder, data_file, overwrite=True)

Creates a table in sqlite database using the supplied dataframe, also saves .pickle files for each table saved to the sql database (these are much quicker to load in to memory using Python and Pandas).

:param df: Dataframe to convert to sqlite table

:param table_name: Table name of dataframe (to be saved as in database)

:param data_folder: Location of folder where data is contained

:param data_file: db_file: Name of sqlite database within data_folder

:param overwrite: overwrite: Rewrite sqlite database if it already exists

:return: True, if write successful

Expand source code

def df_to_sqlite(df, table_name, data_folder, data_file, overwrite=True):
    """
    Creates a table in sqlite database using the supplied dataframe, also saves .pickle files for each table saved to
    the sql database (these are much quicker to load in to memory using Python and Pandas).

    :param df: Dataframe to convert to sqlite table

    :param table_name: Table name of dataframe (to be saved as in database)

    :param data_folder: Location of folder where data is contained

    :param data_file: db_file: Name of sqlite database within data_folder

    :param overwrite: overwrite: Rewrite sqlite database if it already exists

    :return: True, if write successful
    """
    # Executes a query and returns a pandas dataframe
    # query is a string.
    # data_folder is the folder that sqlite file is in.
    # data_file is the name of sqlite file.
    if overwrite:
        if_exists = 'replace'
    else:
        if_exists = None
    try:
        db_file = os.path.join(data_folder, data_file)
        con = sqlite3.connect(db_file)  # connection to the database file
        df.to_sql(table_name, con, if_exists=if_exists, index=False)
        save_string = os.path.join(data_folder, f"{table_name}.pickle")
        df.to_pickle(save_string)
        del df
        gc.collect()
        return True
    except ValueError as e:
        print("Table already exists in database, set Overwrite = True if you wish to overwrite existing table.")
        return None
    finally:
        con.close()

def parse_domain_names(name: str) ‑> str

Function to read domain name and return two letter abbreviation for each domain

:param name: (str) filename similar to "Partner_DM_2021-09-20.csv"

:return: (str) Two letter abbreviated domain name e.g. "DM" for above filename

Expand source code

def parse_domain_names(name: str) -> str:
    """
    Function to read domain name and return two letter abbreviation for each domain

    :param name: (str) filename similar to "Partner_DM_2021-09-20.csv"

    :return: (str) Two letter abbreviated domain name e.g. "DM" for above filename
    """
    domain = [i for i in name.split("_") if i in ALL_DOMAINS]
    if len(domain) > 1:
        print("Couldn't parse domain name from filename returning full path as domain name")
        return name
    else:
        return domain.pop()