Skip to content
Snippets Groups Projects
Commit 2fd2d1a8 authored by Ivy Wang's avatar Ivy Wang
Browse files

Merge branch 'main' of https://github.com/arshana/COVID-19-Data-Integration into main

parents c7c1b37c 6ed871c4
No related branches found
No related tags found
No related merge requests found
...@@ -49,14 +49,14 @@ Christmas Island,CX ...@@ -49,14 +49,14 @@ Christmas Island,CX
Cocos (Keeling) Islands,CC Cocos (Keeling) Islands,CC
Colombia,CO Colombia,CO
Comoros,KM Comoros,KM
Congo,CG Congo-Brazzaville,CG
"Congo, the Democratic Republic of the",CD Congo-Kinshasa,CD
Cook Islands,CK Cook Islands,CK
Costa Rica,CR Costa Rica,CR
Cote d'Ivoire,CI Cote d'Ivoire,CI
Croatia,HR Croatia,HR
Cuba,CU Cuba,CU
Curaçao,CW Curacao,CW
Cyprus,CY Cyprus,CY
Czech Republic,CZ Czech Republic,CZ
Denmark,DK Denmark,DK
...@@ -165,6 +165,7 @@ Niger,NE ...@@ -165,6 +165,7 @@ Niger,NE
Nigeria,NG Nigeria,NG
Niue,NU Niue,NU
Norfolk Island,NF Norfolk Island,NF
North Ireland,XI
Northern Mariana Islands,MP Northern Mariana Islands,MP
Norway,NO Norway,NO
Oman,OM Oman,OM
...@@ -181,7 +182,7 @@ Poland,PL ...@@ -181,7 +182,7 @@ Poland,PL
Portugal,PT Portugal,PT
Puerto Rico,PR Puerto Rico,PR
Qatar,QA Qatar,QA
Réunion,RE Reunion,RE
Romania,RO Romania,RO
Russia,RU Russia,RU
Rwanda,RW Rwanda,RW
......
import pandas as pd import pandas as pd
import sqlite3 import sqlite3
import datetime import datetime
import sys
from util import * from util import *
...@@ -81,16 +82,163 @@ def init_jhu_us_states(c, conn, src_id): ...@@ -81,16 +82,163 @@ def init_jhu_us_states(c, conn, src_id):
# ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.) # ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.)
# First csv: 01-22-2020 # First csv: 01-22-2020
# Base URL of the JHU CSSE daily report CSVs; one file per day, named MM-DD-YYYY.csv.
_JHU_DAILY_REPORTS_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Country spellings that appear in the JHU files, mapped to the spelling that is
# passed to get_country_code.
_JHU_COUNTRY_ALIASES = {
    "Burma": "Myanmar",
    "Czechia": "Czech Republic",
    "Taiwan*": "Taiwan",
    "Korea, South": "South Korea",
    "US": "United States",
    "Congo (Brazzaville)": "Congo-Brazzaville",
    "Republic of the Congo": "Congo-Brazzaville",
    "Congo (Kinshasa)": "Congo-Kinshasa",
    "Mainland China": "China",
    "Macau": "Macao",
    "Macao SAR": "Macao",
    "Bahamas, The": "Bahamas",
    "The Bahamas": "Bahamas",
    "Republic of Korea": "South Korea",
    "The Gambia": "Gambia",
    "Gambia, The": "Gambia",
    "Ivory Coast": "Cote d'Ivoire",
    "Hong Kong SAR": "Hong Kong",
    "Republic of Ireland": "Ireland",
    "East Timor": "Timor-Leste",
    "Russian Federation": "Russia",
    "Republic of Moldova": "Moldova",
    "Iran (Islamic Republic of)": "Iran",
    "Viet Nam": "Vietnam",
    "Cape Verde": "Cabo Verde",
    "Vatican City": "Holy See",
    "UK": "United Kingdom",
}

# One INSERT statement per level of the location hierarchy.
_JHU_INSERT_SQL = {
    "country": '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)''',
    "region": '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)''',
    "district": '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)''',
}


def _read_jhu_daily_report(url):
    """Download one daily report CSV into a DataFrame, skipping malformed lines."""
    try:
        return pd.read_csv(url, on_bad_lines="skip")
    except TypeError:
        # Older pandas releases do not accept on_bad_lines; fall back to the
        # spelling they understand.
        return pd.read_csv(url, error_bad_lines=False)


def _record_jhu_delta(c, level, code, date, src_id, row, prev_totals):
    """Insert one row holding the change since the last value seen for this location.

    prev_totals maps (level, code) to the last non-blank Deaths / Confirmed /
    Recovered values read for that location. Keying on the level keeps country,
    region and district codes from colliding. A blank cell is inserted as None
    and leaves the remembered value untouched.
    """
    previous = prev_totals.setdefault((level, code), {})
    deltas = []
    # Order matches the SQL columns: death_numbers, case_numbers, recovery_numbers.
    for field in ("Deaths", "Confirmed", "Recovered"):
        total = getattr(row, field, None)
        # pandas reports an empty cell as NaN, never as None.
        if total is None or pd.isna(total):
            deltas.append(None)
        else:
            deltas.append(total - previous.get(field, 0))
            previous[field] = total
    c.execute(_JHU_INSERT_SQL[level], (code, date, src_id) + tuple(deltas))


def _load_jhu_daily_report(df, date, c, conn, src_id, prev_totals, missing_countries):
    """Insert every usable row of one daily report at country, region or district level."""
    columns = df.columns
    has_country = "Country_Region" in columns
    has_region = "Province_State" in columns
    has_district = "Admin2" in columns
    has_long = "Long_" in columns
    has_lat = "Lat" in columns

    for row in df.itertuples():
        # Files without the underscore-style headers are read by position instead
        # (index 0 of the tuple is the DataFrame index).
        country_name = str(row.Country_Region if has_country else row[2]).strip()
        country_name = _JHU_COUNTRY_ALIASES.get(country_name, country_name)
        country_code = get_country_code(country_name, c)
        if country_code is None:
            missing_countries.add(country_name)
            continue

        region_name = str(row.Province_State if has_region else row[1]).strip()
        if region_name.lower() == "nan":  # a country-level entry
            _record_jhu_delta(c, "country", country_code, date, src_id, row, prev_totals)
            continue
        # skip Recovered row - irrelevant data - be on the look out for other
        # special cases that haven't been noticed yet
        if region_name in ("Recovered", "Unknown"):
            continue

        longitude = row.Long_ if has_long else None
        latitude = row.Lat if has_lat else None

        region_code = get_region_code(str(country_code), region_name, c)
        if region_code is None:
            sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
            c.execute(sql, (region_name, country_code, longitude, latitude))
            conn.commit()
            region_code = get_region_code(str(country_code), region_name, c)

        subregion_name = str(row.Admin2).strip() if has_district else None
        if subregion_name is None or subregion_name.lower() == "nan":  # a region-level entry
            _record_jhu_delta(c, "region", region_code, date, src_id, row, prev_totals)
            continue
        if subregion_name == "Unassigned":
            continue

        subregion_code = get_district_code(region_code, subregion_name, c)
        if subregion_code is None:
            sql = '''INSERT INTO Districts (district_name, region_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
            c.execute(sql, (subregion_name, region_code, longitude, latitude))
            conn.commit()
            subregion_code = get_district_code(region_code, subregion_name, c)
        _record_jhu_delta(c, "district", subregion_code, date, src_id, row, prev_totals)


def init_jhu_global(c, conn, src_id):
    """Load the JHU daily report CSVs, one per day starting at 01-22-2020.

    For each day the CSV is downloaded and every row is inserted into
    Cases_Per_Country, Cases_Per_Region or Cases_Per_District as the difference
    from the previous value seen for that location. Regions and Districts that
    are not found are inserted first. The loop stops at the first day whose CSV
    cannot be loaded or processed; a 404 is treated as the normal end of the
    data, anything else is reported on stderr.

    Args:
        c: database cursor used for all lookups and inserts.
        conn: connection owning ``c``; committed after every CSV and after each
            new Region / District.
        src_id: value written to the source_id column of every inserted row.
    """
    dt = datetime.datetime(2020, 1, 22)
    # used to keep track of any countries that might need to be added to the
    # countries table - for debugging purposes
    missing_countries_set = set()
    prev_totals = {}
    # TODO test again after the Namibia issue from prototype_main_backend is fixed
    while True:
        date = dt.strftime('%m-%d-%Y')
        csv_name = _JHU_DAILY_REPORTS_URL + date + '.csv'
        try:
            df = _read_jhu_daily_report(csv_name)
        except Exception as e:
            # An HTTP 404 just means no report exists for this date.
            if getattr(e, "code", None) != 404:
                sys.stderr.write("init_jhu_global: could not load %s: %r\n" % (csv_name, e))
            break
        try:
            _load_jhu_daily_report(df, date, c, conn, src_id, prev_totals, missing_countries_set)
            conn.commit()  # runs after every csv
        except Exception as e:
            # Stop as before, but report the failure instead of hiding it.
            sys.stderr.write("init_jhu_global: failed while loading %s: %r\n" % (csv_name, e))
            break
        dt += datetime.timedelta(days=1)
    # debugging
    #print(missing_countries_set)
\ No newline at end of file
# JHU data for US states is handled in init_global.py
\ No newline at end of file
...@@ -4,6 +4,7 @@ import pandas as pd ...@@ -4,6 +4,7 @@ import pandas as pd
conn = sqlite3.connect('prototype_db') conn = sqlite3.connect('prototype_db')
c = conn.cursor() c = conn.cursor()
# TODO Namibia's country code is currently being read as NULL; wait for SQLServer and then debug
c.execute(''' c.execute('''
CREATE TABLE Countries( CREATE TABLE Countries(
country_code VARCHAR(2) PRIMARY KEY, country_code VARCHAR(2) PRIMARY KEY,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment