Commit 005db1e8 authored by Arshana Jain

rough draft jhu daily

parent 777a39fc
import pandas as pd
import sqlite3
import datetime
import sys
import json
from util import *
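# Daily JHU CSSE ingestion: pulls the most recent daily-report CSVs, converts the
# cumulative totals in each report into day-over-day deltas, and stores the results
# in the prototype_db SQLite database. The last-seen cumulative totals are persisted
# in jhu_us.json and jhu_global.json between runs so the next run can compute deltas.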
def daily_jhu():
    conn = sqlite3.connect('prototype_db')
    c = conn.cursor()
    # insert and get source id for source
    src_url = "https://github.com/CSSEGISandData/COVID-19"
    set_source(src_url, c, conn)
    src_id = get_source_id(src_url, c)
    daily_jhu_us_states(c, conn, src_id)
    #daily_jhu_global(c, conn, src_id)
    conn.close()
# US States from JHU data
# ONLY SAFE TO CALL FROM daily_jhu in this state
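# jhu_us.json holds the last-seen cumulative totals, one JSON object per line, in this
# order: deaths, recoveries, cases, hospitalizations (keyed by region code).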
def daily_jhu_us_states(c, conn, src_id):
    i = 0
    prev_death_dict = {}
    prev_recovered_dict = {}
    prev_case_dict = {}
    prev_hospitalized_dict = {}
    with open('jhu_us.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_death_dict = json.loads(line)
            elif i == 1:
                prev_recovered_dict = json.loads(line)
            elif i == 2:
                prev_case_dict = json.loads(line)
            elif i == 3:
                prev_hospitalized_dict = json.loads(line)
            i += 1
    # get country_code
    us_code = get_country_code("United States", c)
    # insert state data in Cases_Per_Region
    # the data is cumulative - need the previous data to accurately update the new data
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    last_error = ""
    for i in range(0, 3):
        date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, src_id))
        already_entered = c.fetchall() != []
        if not already_entered:
            csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/' + date + '.csv'
            try:
                df = pd.read_csv(csv_name, error_bad_lines=False)
                for row in df.itertuples():
                    region_code = get_region_code(us_code, row.Province_State, c)
                    prev_death = 0 if region_code not in prev_death_dict else prev_death_dict[region_code]
                    prev_recovered = 0 if region_code not in prev_recovered_dict else prev_recovered_dict[region_code]
                    prev_case = 0 if region_code not in prev_case_dict else prev_case_dict[region_code]
                    prev_hospitalized = 0 if region_code not in prev_hospitalized_dict else prev_hospitalized_dict[region_code]
                    if region_code is not None:
                        sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers, hospitalization_numbers) VALUES (?, ?, ?, ?, ?, ?, ?)'''
                        # handles the case of a blank column by inserting None
                        c.execute(sql, (region_code, date, src_id,
                                        row.Deaths - prev_death if pd.notna(row.Deaths) else None,
                                        row.Confirmed - prev_case if pd.notna(row.Confirmed) else None,
                                        row.Recovered - prev_recovered if pd.notna(row.Recovered) else None,
                                        row.People_Hospitalized - prev_hospitalized if pd.notna(row.People_Hospitalized) else None))
                        # update previous
                        if pd.notna(row.Deaths):
                            prev_death_dict[region_code] = row.Deaths
                        if pd.notna(row.Recovered):
                            prev_recovered_dict[region_code] = row.Recovered
                        if pd.notna(row.Confirmed):
                            prev_case_dict[region_code] = row.Confirmed
                        if pd.notna(row.People_Hospitalized):
                            prev_hospitalized_dict[region_code] = row.People_Hospitalized
                    else:
                        last_error = (row.Province_State + " was missing from the Regions table - daily_jhu_us_states " + csv_name + ".")
                conn.commit()
            except Exception:
                break
        dt += datetime.timedelta(days=1)
    print(last_error)
    with open('jhu_us.json', 'w') as f:
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.write(json.dumps(prev_case_dict)+'\n')
        f.write(json.dumps(prev_hospitalized_dict)+'\n')
# Global JHU data
# ONLY SAFE TO CALL FROM daily_jhu in this state (otherwise consider that source may be replicated, etc.)
# First csv: 01-22-2020
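# jhu_global.json holds six JSON objects, one per line: country/region-level deaths,
# recoveries, and cases, followed by subregion (district) level deaths, recoveries, and cases.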
def daily_jhu_global(c, conn, src_id):
    missing_countries_set = set()  # used to keep track of any countries that might need to be added to the countries table - for debugging purposes
    # can be used for country and region codes since they are unique from each other
    prev_death_dict = {}
    prev_recovered_dict = {}
    prev_case_dict = {}
    # for subregion since codes overlap with region
    prev_death_dict_subregion = {}
    prev_recovered_dict_subregion = {}
    prev_case_dict_subregion = {}
    i = 0
    with open('jhu_global.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_death_dict = json.loads(line)
            elif i == 1:
                prev_recovered_dict = json.loads(line)
            elif i == 2:
                prev_case_dict = json.loads(line)
            elif i == 3:
                prev_death_dict_subregion = json.loads(line)
            elif i == 4:
                prev_recovered_dict_subregion = json.loads(line)
            elif i == 5:
                prev_case_dict_subregion = json.loads(line)
            i += 1
    # TODO test again after the Namibia issue from prototype_main_backend is fixed
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    for i in range(0, 3):
        date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, src_id))
        already_entered = c.fetchall() != []
        if not already_entered:
            csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + date + '.csv'
            try:
                df = pd.read_csv(csv_name, error_bad_lines=False)
                for row in df.itertuples():
                    # normalize country name
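                    # JHU has used several spellings over time; map them to the names
                    # stored in the Countries table.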
                    country_name = None
                    if "Country_Region" in df.columns:
                        country_name = str(row.Country_Region).strip()
                    else:
                        country_name = str(row._2).strip()
                    if (country_name == "Burma"):
                        country_name = "Myanmar"
                    elif (country_name == "Czechia"):
                        country_name = "Czech Republic"
                    elif (country_name == "Taiwan*"):
                        country_name = "Taiwan"
                    elif (country_name == "Korea, South"):
                        country_name = "South Korea"
                    elif (country_name == "US"):
                        country_name = "United States"
                    elif (country_name == "Congo (Brazzaville)" or country_name == "Republic of the Congo"):
                        country_name = "Congo-Brazzaville"
                    elif (country_name == "Congo (Kinshasa)"):
                        country_name = "Congo-Kinshasa"
                    elif (country_name == "Mainland China"):
                        country_name = "China"
                    elif (country_name == "Macau" or country_name == "Macao SAR"):
                        country_name = "Macao"
                    elif (country_name == "Bahamas, The" or country_name == "The Bahamas"):
                        country_name = "Bahamas"
                    elif (country_name == "Republic of Korea"):
                        country_name = "South Korea"
                    elif (country_name == "The Gambia" or country_name == "Gambia, The"):
                        country_name = "Gambia"
                    elif (country_name == "Ivory Coast"):
                        country_name = "Cote d'Ivoire"
                    elif (country_name == "Hong Kong SAR"):
                        country_name = "Hong Kong"
                    elif (country_name == "Republic of Ireland"):
                        country_name = "Ireland"
                    elif (country_name == "East Timor"):
                        country_name = "Timor-Leste"
                    elif (country_name == "Russian Federation"):
                        country_name = "Russia"
                    elif (country_name == "Republic of Moldova"):
                        country_name = "Moldova"
                    elif (country_name == "Iran (Islamic Republic of)"):
                        country_name = "Iran"
                    elif (country_name == "Viet Nam"):
                        country_name = "Vietnam"
                    elif (country_name == "Cape Verde"):
                        country_name = "Cabo Verde"
                    elif (country_name == "Vatican City"):
                        country_name = "Holy See"
                    elif (country_name == "UK"):
                        country_name = "United Kingdom"
                    country_code = get_country_code(country_name, c)
                    if country_code is None:
                        missing_countries_set.add(country_name)
                    else:
                        region_name = None
                        if "Province_State" in df.columns:
                            region_name = str(row.Province_State).strip()
                        else:
                            region_name = str(row._1).strip()
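                        # Rows without a Province_State are country-level totals; rows with a
                        # Province_State but no Admin2 are region-level; rows with Admin2 are
                        # subregion (district) level.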
                        if (region_name is None or str(region_name).lower() == "nan"):  # a country-level entry
                            prev_death = 0 if country_code not in prev_death_dict else prev_death_dict[country_code]
                            prev_recovered = 0 if country_code not in prev_recovered_dict else prev_recovered_dict[country_code]
                            prev_case = 0 if country_code not in prev_case_dict else prev_case_dict[country_code]
                            sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                            # handles the case of a blank column by inserting None
                            c.execute(sql, (country_code, date, src_id,
                                            row.Deaths - prev_death if pd.notna(row.Deaths) else None,
                                            row.Confirmed - prev_case if pd.notna(row.Confirmed) else None,
                                            row.Recovered - prev_recovered if pd.notna(row.Recovered) else None))
                            # update previous
                            if pd.notna(row.Deaths):
                                prev_death_dict[country_code] = row.Deaths
                            if pd.notna(row.Recovered):
                                prev_recovered_dict[country_code] = row.Recovered
                            if pd.notna(row.Confirmed):
                                prev_case_dict[country_code] = row.Confirmed
                        elif (region_name != "Recovered" and region_name != "Unknown"):  # a region-level entry
                            # skip Recovered row - irrelevant data - be on the lookout for other special cases that haven't been noticed yet
                            region_code = get_region_code(str(country_code), str(region_name), c)
                            if region_code is None:
                                sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                                c.execute(sql, (str(region_name), country_code, row.Long_ if 'Long_' in df.columns else None, row.Lat if 'Lat' in df.columns else None))
                                conn.commit()
                                region_code = get_region_code(str(country_code), str(region_name), c)
                            subregion_name = None
                            if "Admin2" in df.columns:
                                subregion_name = str(row.Admin2).strip()
                            if (subregion_name is None or str(subregion_name).lower() == "nan"):
                                prev_death = 0 if region_code not in prev_death_dict else prev_death_dict[region_code]
                                prev_recovered = 0 if region_code not in prev_recovered_dict else prev_recovered_dict[region_code]
                                prev_case = 0 if region_code not in prev_case_dict else prev_case_dict[region_code]
                                sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                                # handles the case of a blank column by inserting None
                                c.execute(sql, (region_code, date, src_id,
                                                row.Deaths - prev_death if pd.notna(row.Deaths) else None,
                                                row.Confirmed - prev_case if pd.notna(row.Confirmed) else None,
                                                row.Recovered - prev_recovered if pd.notna(row.Recovered) else None))
                                # update previous
                                if pd.notna(row.Deaths):
                                    prev_death_dict[region_code] = row.Deaths
                                if pd.notna(row.Recovered):
                                    prev_recovered_dict[region_code] = row.Recovered
                                if pd.notna(row.Confirmed):
                                    prev_case_dict[region_code] = row.Confirmed
                            elif (subregion_name != "Unassigned"):
                                subregion_code = get_district_code(region_code, str(subregion_name), c)
                                if subregion_code is None:
                                    sql = '''INSERT INTO Districts (district_name, region_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                                    c.execute(sql, (str(subregion_name), region_code, row.Long_ if 'Long_' in df.columns else None, row.Lat if 'Lat' in df.columns else None))
                                    conn.commit()
                                    subregion_code = get_district_code(region_code, str(subregion_name), c)
                                prev_death = 0 if subregion_code not in prev_death_dict_subregion else prev_death_dict_subregion[subregion_code]
                                prev_recovered = 0 if subregion_code not in prev_recovered_dict_subregion else prev_recovered_dict_subregion[subregion_code]
                                prev_case = 0 if subregion_code not in prev_case_dict_subregion else prev_case_dict_subregion[subregion_code]
                                sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                                # handles the case of a blank column by inserting None
                                c.execute(sql, (subregion_code, date, src_id,
                                                row.Deaths - prev_death if pd.notna(row.Deaths) else None,
                                                row.Confirmed - prev_case if pd.notna(row.Confirmed) else None,
                                                row.Recovered - prev_recovered if pd.notna(row.Recovered) else None))
                                # update previous
                                if pd.notna(row.Deaths):
                                    prev_death_dict_subregion[subregion_code] = row.Deaths
                                if pd.notna(row.Recovered):
                                    prev_recovered_dict_subregion[subregion_code] = row.Recovered
                                if pd.notna(row.Confirmed):
                                    prev_case_dict_subregion[subregion_code] = row.Confirmed
                conn.commit()  # runs after every csv
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                # debugging
                #print(exc_tb.tb_lineno)
                #print(e)
                break
        dt += datetime.timedelta(days=1)
    # debugging
    #print(missing_countries_set)
    with open('jhu_global.json', 'w') as f:
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.write(json.dumps(prev_case_dict)+'\n')
        f.write(json.dumps(prev_death_dict_subregion)+'\n')
        f.write(json.dumps(prev_recovered_dict_subregion)+'\n')
        f.write(json.dumps(prev_case_dict_subregion)+'\n')
\ No newline at end of file
@@ -2,6 +2,7 @@ import pandas as pd
import sqlite3
import datetime
import sys
import json
from util import *
@@ -78,6 +79,13 @@ def init_jhu_us_states(c, conn, src_id):
    print(last_error)
    with open('jhu_us.json', 'w') as f:
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.write(json.dumps(prev_case_dict)+'\n')
        f.write(json.dumps(prev_hospitalized_dict)+'\n')
    f.close()
# Global JHU data
# ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.)
# First csv: 01-22-2020
@@ -96,8 +104,7 @@ def init_jhu_global(c, conn, src_id):
    prev_case_dict_subregion = {}
    # TODO test again after the Namibia issue from prototype_main_backend is fixed
    i = 0
    while (i < 3):
    while (True):
        date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
        csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + date + '.csv'
        try:
@@ -241,4 +248,12 @@ def init_jhu_global(c, conn, src_id):
        dt += datetime.timedelta(days=1)
    # debugging
    #print(missing_countries_set)
\ No newline at end of file
    #print(missing_countries_set)
    with open('jhu_global.json', 'w') as f:
        f.write(json.dumps(prev_death_dict)+'\n')
        f.write(json.dumps(prev_recovered_dict)+'\n')
        f.write(json.dumps(prev_case_dict)+'\n')
        f.write(json.dumps(prev_death_dict_subregion)+'\n')
        f.write(json.dumps(prev_recovered_dict_subregion)+'\n')
        f.write(json.dumps(prev_case_dict_subregion)+'\n')
    f.close()
\ No newline at end of file