Skip to content
Snippets Groups Projects
daily_europe.py 14 KiB
Newer Older
Arshana Jain's avatar
Arshana Jain committed
import pandas as pd
import datetime
Arshana Jain's avatar
Arshana Jain committed
import sys
import sqlite3
Arshana Jain's avatar
Arshana Jain committed
import json
Arshana Jain's avatar
Arshana Jain committed
sys.path.append("..")
Arshana Jain's avatar
Arshana Jain committed
from util import *
Arshana Jain's avatar
Arshana Jain committed
    conn = sqlite3.connect('sqlite_db')
    c = conn.cursor()

    src_url = "https://github.com/ec-jrc/COVID-19"
    set_source(src_url, c, conn)
    src_id = get_source_id(src_url, c)

    daily_jrc_countries(src_id, c, conn)
    c.close()

# Use this for European countries only. Other countries appear to be either unreliable or have a lot of holes in their data.
# Using this in addition to JHU data, because it includes hospitalization data, while JHU does not
def daily_jrc_countries(src_id, c, conn):
    prev_death_dict = {}
    prev_recovered_dict = {}
    prev_cases_dict = {}

    i = 0
    with open('jrc_countries.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_death_dict = json.loads(line)
            elif i == 1:
                prev_recovered_dict = json.loads(line)
            elif i == 2:
                prev_cases_dict = json.loads(line)
            i += 1
        f.close()

    missing_countries_set = set(())  # used to keep track of any countries that might need to be added to the countries table - for debugging purposes

    # Certain countries have strange data, regardless of if they are in the EU or not. This set has includes country names that don't seem to have that strange pattern.
    acceptable_countries_set = set(("Germany", "United Kingdom", "Italy", "Spain", "Romania", "Netherlands", "Belgium", "Sweden", "Austria", "Switzerland", "Slovakia", "Norway", "Albania"))  

    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    for i in range(0, 3):
        date = jrc_date(dt)
        sql = '''SELECT date_collected FROM Cases_Per_Country WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, src_id))
        already_entered = c.fetchall() != []
        if not already_entered:
            try:
                csv = "https://raw.githubusercontent.com/ec-jrc/COVID-19/master/data-by-country/jrc-covid-19-countries-" + date + ".csv"
                df = pd.read_csv(csv)
                for row in df.itertuples():
                    if row.EUcountry is True and row.CountryName in acceptable_countries_set:
                        country_code = get_country_code(row.CountryName, c)
                        
                        if country_code == None:
                            missing_countries_set.add(row.CountryName)

                        else:
                            sql = '''SELECT date_collected FROM Cases_Per_Country WHERE date_collected = ? AND source_id = ? AND country_code = ?'''
                            c.execute(sql, (row.Date, src_id, country_code))
                            already_entered = c.fetchall() != []
                            if not already_entered:
                                prev_death = 0 if country_code not in prev_death_dict else prev_death_dict[country_code]
                                prev_recovered = 0 if country_code not in prev_recovered_dict else prev_recovered_dict[country_code]
                                prev_cases = 0 if country_code not in prev_cases_dict else prev_cases_dict[country_code]

                                deaths = (row.CumulativeDeceased - prev_death) if isNum(row.CumulativeDeceased) else None
                                cases = (row.CumulativePositive - prev_cases) if isNum(row.CumulativePositive) else None
                                recovered = (row.CumulativeRecovered - prev_recovered) if isNum(row.CumulativeRecovered) else None
                                hospitalized = int(row.Hospitalized) if isNum(row.Hospitalized) else None
                                
                                sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers, hospitalization_numbers) VALUES (?, ?, ?, ?, ?, ?, ?)'''
                                c.execute(sql,(country_code, row.Date, src_id, deaths, cases, recovered, hospitalized))
                                
                                if isNum(row.CumulativeDeceased):
                                    prev_death_dict[country_code] = row.CumulativeDeceased
                                if isNum(row.CumulativeRecovered):
                                    prev_recovered_dict[country_code] = row.CumulativeRecovered
                                if isNum(row.CumulativePositive):
                                    prev_cases_dict[country_code] = row.CumulativePositive
                conn.commit()
            except:
                break
        dt += datetime.timedelta(days=1)

    # debugging
    #print(missing_countries_set)

    with open('jrc_countries.json', 'w') as f:
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.write(json.dumps(prev_cases_dict)+'\n')
        f.close()


def jrc_date(dt):
    return str(dt.year) + ('0' if dt.month < 10 else '') + str(dt.month) + ('0' if dt.day < 10 else '') + str(dt.day)


Arshana Jain's avatar
Arshana Jain committed
# JRC includes Italy data, but not the same subsets
Arshana Jain's avatar
Arshana Jain committed
def daily_italy():
Arshana Jain's avatar
Arshana Jain committed
    df_total = pd.read_csv('https://raw.githubusercontent.com/RamiKrispin/covid19Italy/master/csv/italy_total.csv', error_bad_lines=False)
    df_region = pd.read_csv('https://raw.githubusercontent.com/RamiKrispin/covid19Italy/master/csv/italy_region.csv', error_bad_lines=False)
    df_subregion = pd.read_csv('https://raw.githubusercontent.com/RamiKrispin/covid19Italy/master/csv/italy_province.csv', error_bad_lines=False)

Arshana Jain's avatar
Arshana Jain committed
    i = 0
    prev_row = {}
    prev_death_dict = {}
    prev_recovered_dict = {}
    with open('italy.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_row = json.loads(line)
            elif i == 1:
                prev_death_dict = json.loads(line)
            elif i == 2:
Arshana Jain's avatar
Arshana Jain committed
                prev_recovered_dict = json.loads(line)
            i += 1
        f.close()

Arshana Jain's avatar
Arshana Jain committed
    conn = sqlite3.connect('sqlite_db')
Arshana Jain's avatar
Arshana Jain committed
    c = conn.cursor()

    # get country_code
    italy_code = get_country_code("Italy", c)

    # get source id for source
    italy_src_url = "https://github.com/RamiKrispin/covid19italy"
    italy_src = get_source_id(italy_src_url, c)
    
    # insert total
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
Arshana Jain's avatar
Arshana Jain committed
    for i in range(0, 3):
        date = get_italy_date(dt)
        sql = '''SELECT date_collected FROM Cases_Per_Country WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, italy_src))
        already_entered = c.fetchall() != []
Arshana Jain's avatar
Arshana Jain committed
        if not already_entered:
            country_rows = df_total.loc[df_total['date'] == date]
            for i in range(len(country_rows)):
                row = country_rows.iloc[i]
                prev_death = 0 if "death" not in prev_row else prev_row["death"]
                prev_recovered = 0 if "recovered" not in prev_row else prev_row["recovered"]
                sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers, hospitalization_numbers) VALUES (?, ?, ?, ?, ?, ?, ?)'''
                c.execute(sql,(italy_code, row.date, italy_src, (row.death - prev_death) if isNum(row.death) else None, int(row.daily_positive_cases) if isNum(row.daily_positive_cases) else None, (row.recovered - prev_recovered) if isNum(row.recovered) else None, int(row.total_hospitalized) if isNum(row.total_hospitalized) else None))
                if isNum(row.death):
Arshana Jain's avatar
Arshana Jain committed
                    prev_row["death"] = int(row.death)
                if isNum(row.recovered):
Arshana Jain's avatar
Arshana Jain committed
                    prev_row["recovered"] = int(row.recovered)
Arshana Jain's avatar
Arshana Jain committed
            conn.commit()
        dt += datetime.timedelta(days=1)
Arshana Jain's avatar
Arshana Jain committed
    
    # set up + insert regions
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
Arshana Jain's avatar
Arshana Jain committed
    for i in range(0, 3):
        date = get_italy_date(dt)
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, italy_src))
        already_entered = c.fetchall() != []
Arshana Jain's avatar
Arshana Jain committed
        if not already_entered:
            region_rows = df_region.loc[df_region['date'] == date]
            for i in range(len(region_rows)):
                row = region_rows.iloc[i]
                region_code = get_region_code(italy_code, row.region_name, c)
                if region_code is None:
                    sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                    c.execute(sql,(row.region_name, italy_code, row.long, row.lat))
                    conn.commit()
                    region_code = get_region_code(italy_code, row.region_name, c)
                prev_death = 0 if region_code not in prev_death_dict else prev_death_dict[region_code]
                prev_recovered = 0 if region_code not in prev_recovered_dict else prev_recovered_dict[region_code]
                sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers, hospitalization_numbers) VALUES (?, ?, ?, ?, ?, ?, ?)'''
                c.execute(sql,(region_code, row.date, italy_src, (row.death - prev_death) if isNum(row.death) else None, int(row.daily_positive_cases) if isNum(row.daily_positive_cases) else None, (row.recovered - prev_recovered) if isNum(row.recovered) else None, int(row.total_hospitalized) if isNum(row.total_hospitalized) else None))
                if isNum(row.death):
Arshana Jain's avatar
Arshana Jain committed
                    prev_death_dict[region_code] = int(row.death)
                if isNum(row.recovered):
Arshana Jain's avatar
Arshana Jain committed
                    prev_recovered_dict[region_code] = int(row.recovered)
Arshana Jain's avatar
Arshana Jain committed
            conn.commit()
        dt += datetime.timedelta(days=1)
Arshana Jain's avatar
Arshana Jain committed

    dt = datetime.datetime.today() - datetime.timedelta(days=2)
Arshana Jain's avatar
Arshana Jain committed
    italy_district_helper(get_italy_date(dt), italy_code, italy_src, df_subregion, c, conn)

    for i in range(0, 2):
        dt += datetime.timedelta(days=1)
Arshana Jain's avatar
Arshana Jain committed
        italy_district_helper(get_italy_date(dt), italy_code, italy_src, df_subregion, c, conn)
            
    conn.close()

    with open('italy.json', 'w') as f:
        f.write(json.dumps(prev_row)+'\n')
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.close()


def get_italy_date(dt):
    return str(dt.year)+ '-' + ('0' if dt.month < 10 else '') + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day)


def italy_district_helper(date, italy_code, italy_src, df_subregion, c, conn):
    sql = '''SELECT date_collected FROM Cases_Per_District WHERE date_collected = ? AND source_id = ?'''
    c.execute(sql, (date, italy_src))
    already_entered = c.fetchall() != []
Arshana Jain's avatar
Arshana Jain committed
    if not already_entered:
        subregion_rows = df_subregion.loc[df_subregion['date'] == date]
        for i in range(len(subregion_rows)):
            row = subregion_rows.iloc[i]
            region_code = get_region_code(italy_code, row.region_name, c)
            if region_code is None:
                sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                c.execute(sql,(row.region_name, italy_code, row.long, row.lat))
                conn.commit()
                region_code = get_region_code(italy_code, row.region_name, c)
            subregion_code = get_district_code(region_code, row.province_name, c)
            if subregion_code is None:
                sql = '''INSERT INTO Districts (district_name, region_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                c.execute(sql,(row.province_name, region_code, row.long, row.lat))
                conn.commit()
                subregion_code = get_district_code(region_code, row.province_name, c)
            sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, case_numbers) VALUES (?, ?, ?, ?)'''
            c.execute(sql,(subregion_code, row.date, italy_src, int(row.new_cases)))
        conn.commit()


Arshana Jain's avatar
Arshana Jain committed
# Updates daily. Check for both the last two days' and today's data to make sure they are all gotten regardless of when the source is updated
def daily_ukraine():
Arshana Jain's avatar
Arshana Jain committed
    conn = sqlite3.connect('sqlite_db')
Arshana Jain's avatar
Arshana Jain committed
    c = conn.cursor()

    # get country_code for Ukraine
    ukraine_code = get_country_code("Ukraine", c)

    # get source id for Ukraine source
    ukraine_src_url = "https://github.com/dmytro-derkach/covid-19-ukraine"
    ukraine_src = get_source_id(ukraine_src_url, c)

Arshana Jain's avatar
Arshana Jain committed
    dt = datetime.datetime.today()
Arshana Jain's avatar
Arshana Jain committed
    ukraine_helper(get_ukraine_date(dt), ukraine_code, ukraine_src, c, conn)
Arshana Jain's avatar
Arshana Jain committed

    for i in range(0, 2):
        dt -= datetime.timedelta(days=1)
Arshana Jain's avatar
Arshana Jain committed
        ukraine_helper(get_ukraine_date(dt), ukraine_code, ukraine_src, c, conn)
Arshana Jain's avatar
Arshana Jain committed
def get_ukraine_date(dt):
Arshana Jain's avatar
Arshana Jain committed
    return ('0' if dt.month < 10 else '')  + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)


def ukraine_helper(date, ukraine_code, ukraine_src, c, conn):
Arshana Jain's avatar
Arshana Jain committed
    try:
Arshana Jain's avatar
Arshana Jain committed
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
Arshana Jain's avatar
Arshana Jain committed
        c.execute(sql, (date, ukraine_src))
        already_entered = c.fetchall() != []
Arshana Jain's avatar
Arshana Jain committed
        if not already_entered:
            csv_name = 'https://raw.githubusercontent.com/dmytro-derkach/covid-19-ukraine/master/daily_reports/' + date + '.csv'
            df = pd.read_csv(csv_name, error_bad_lines=False)
            for row in df.itertuples():
                region_code = get_region_code(ukraine_code, row.Province_State, c)
                if region_code is None:
                    sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                    c.execute(sql,(row.Province_State, ukraine_code, row.Long_, row.Lat))
                    conn.commit()
                    region_code = get_region_code(ukraine_code, row.Province_State, c)
                sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                c.execute(sql,(region_code, date, ukraine_src, row.Deaths_delta, row.Confirmed_delta, row.Recovered_delta))
            conn.commit()
Arshana Jain's avatar
Arshana Jain committed
    except:
Arshana Jain's avatar
Arshana Jain committed
        pass