diff --git a/country_countryCode.csv b/country_countryCode.csv index c9444db7ef0e1163fe1c7ddb034edb7804578718..9c4af11bd4ee3fa0737014c61821b1adcb85aa1a 100644 --- a/country_countryCode.csv +++ b/country_countryCode.csv @@ -49,14 +49,14 @@ Christmas Island,CX Cocos (Keeling) Islands,CC Colombia,CO Comoros,KM -Congo,CG -"Congo, the Democratic Republic of the",CD +Congo-Brazzaville,CG +Congo-Kinshasa,CD Cook Islands,CK Costa Rica,CR Cote d'Ivoire,CI Croatia,HR Cuba,CU -Curaçao,CW +Curacao,CW Cyprus,CY Czech Republic,CZ Denmark,DK @@ -165,6 +165,7 @@ Niger,NE Nigeria,NG Niue,NU Norfolk Island,NF +North Ireland,XI Northern Mariana Islands,MP Norway,NO Oman,OM @@ -181,7 +182,7 @@ Poland,PL Portugal,PT Puerto Rico,PR Qatar,QA -Réunion,RE +Reunion,RE Romania,RO Russia,RU Rwanda,RW diff --git a/initial_data_scripts/init_global.py b/initial_data_scripts/init_global.py index e71d0c5fe49cefb8f80b61f99251689de8d0f6a3..f043195e086580b32d9a9784ae0058d480daad3c 100644 --- a/initial_data_scripts/init_global.py +++ b/initial_data_scripts/init_global.py @@ -1,6 +1,7 @@ import pandas as pd import sqlite3 import datetime +import sys from util import * @@ -81,16 +82,163 @@ def init_jhu_us_states(c, conn, src_id): # ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.) 
# First csv: 01-22-2020
_JHU_DAILY_REPORTS_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Cumulative-count columns of a daily report, in the order the INSERT
# statements below expect them (death_numbers, case_numbers, recovery_numbers).
_JHU_COUNT_COLUMNS = ("Deaths", "Confirmed", "Recovered")

# Country spellings seen in the daily reports -> the name passed to
# get_country_code().
_JHU_COUNTRY_ALIASES = {
    "Burma": "Myanmar",
    "Czechia": "Czech Republic",
    "Taiwan*": "Taiwan",
    "Korea, South": "South Korea",
    "US": "United States",
    "Congo (Brazzaville)": "Congo-Brazzaville",
    "Republic of the Congo": "Congo-Brazzaville",
    "Congo (Kinshasa)": "Congo-Kinshasa",
    "Mainland China": "China",
    "Macau": "Macao",
    "Macao SAR": "Macao",
    "Bahamas, The": "Bahamas",
    "The Bahamas": "Bahamas",
    "Republic of Korea": "South Korea",
    "The Gambia": "Gambia",
    "Gambia, The": "Gambia",
    "Ivory Coast": "Cote d'Ivoire",
    "Hong Kong SAR": "Hong Kong",
    "Republic of Ireland": "Ireland",
    "East Timor": "Timor-Leste",
    "Russian Federation": "Russia",
    "Republic of Moldova": "Moldova",
    "Iran (Islamic Republic of)": "Iran",
    "Viet Nam": "Vietnam",
    "Cape Verde": "Cabo Verde",
    "Vatican City": "Holy See",
    "UK": "United Kingdom",
}

_INSERT_COUNTRY_CASES = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
_INSERT_REGION_CASES = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
_INSERT_DISTRICT_CASES = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
_INSERT_REGION = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
_INSERT_DISTRICT = '''INSERT INTO Districts (district_name, region_code, longitude, latitude) VALUES (?, ?, ?, ?)'''


def _read_jhu_daily_report(url):
    """Read one daily-report CSV into a DataFrame, skipping malformed lines."""
    try:
        return pd.read_csv(url, on_bad_lines="skip")
    except TypeError:
        # Older pandas releases only know the previous spelling of this option.
        return pd.read_csv(url, error_bad_lines=False)


def _jhu_cell_text(value):
    """Return a cell as stripped text, or None when the cell is blank (NaN)."""
    text = str(value).strip()
    return None if text.lower() == "nan" else text


def _insert_jhu_delta(c, sql, code, date, src_id, row, previous):
    """Insert the day-over-day change for one location and remember its totals.

    The reports carry cumulative totals, so the stored value is the difference
    from the last total seen for ``code``.  A blank cell is stored as NULL and
    does not overwrite the remembered total; otherwise a single blank day would
    turn every later delta for that location into NaN.
    """
    totals = [getattr(row, column) for column in _JHU_COUNT_COLUMNS]
    deltas = [
        None if pd.isna(total) else total - previous[column].get(code, 0)
        for column, total in zip(_JHU_COUNT_COLUMNS, totals)
    ]
    c.execute(sql, (code, date, src_id, deltas[0], deltas[1], deltas[2]))
    for column, total in zip(_JHU_COUNT_COLUMNS, totals):
        if not pd.isna(total):
            previous[column][code] = total


def _import_jhu_report(c, conn, src_id, date, report, previous, previous_district, missing_countries):
    """Insert every usable row of one daily report at country, region or district level."""
    columns = report.columns
    # Early reports name these columns "Country/Region" and "Province/State";
    # itertuples() exposes such non-identifier names positionally (_2 and _1).
    country_field = "Country_Region" if "Country_Region" in columns else "_2"
    region_field = "Province_State" if "Province_State" in columns else "_1"
    has_district = "Admin2" in columns
    has_longitude = "Long_" in columns
    has_latitude = "Lat" in columns

    for row in report.itertuples():
        country_name = str(getattr(row, country_field)).strip()
        country_name = _JHU_COUNTRY_ALIASES.get(country_name, country_name)
        country_code = get_country_code(country_name, c)
        if country_code is None:
            missing_countries.add(country_name)
            continue

        region_name = _jhu_cell_text(getattr(row, region_field))
        if region_name is None:  # a country-level entry
            _insert_jhu_delta(c, _INSERT_COUNTRY_CASES, country_code, date, src_id, row, previous)
            continue
        if region_name in ("Recovered", "Unknown"):
            # skip Recovered row - irrelevant data - be on the look out for
            # other special cases that haven't been noticed yet
            continue

        longitude = row.Long_ if has_longitude else None
        latitude = row.Lat if has_latitude else None

        region_code = get_region_code(str(country_code), region_name, c)
        if region_code is None:
            c.execute(_INSERT_REGION, (region_name, country_code, longitude, latitude))
            conn.commit()
            region_code = get_region_code(str(country_code), region_name, c)

        district_name = _jhu_cell_text(row.Admin2) if has_district else None
        if district_name is None:  # a region-level entry
            _insert_jhu_delta(c, _INSERT_REGION_CASES, region_code, date, src_id, row, previous)
            continue
        if district_name == "Unassigned":
            continue

        district_code = get_district_code(region_code, district_name, c)
        if district_code is None:
            c.execute(_INSERT_DISTRICT, (district_name, region_code, longitude, latitude))
            conn.commit()
            district_code = get_district_code(region_code, district_name, c)
        _insert_jhu_delta(c, _INSERT_DISTRICT_CASES, district_code, date, src_id, row, previous_district)


def init_jhu_global(c, conn, src_id, max_days=None):
    """Load the JHU CSSE daily reports into the case tables, one day at a time.

    Starting with the 01-22-2020 report, each daily CSV is downloaded and its
    cumulative totals are converted to day-over-day deltas, which are inserted
    into Cases_Per_Country, Cases_Per_Region or Cases_Per_District depending
    on how specific the row is.  Regions and districts that cannot be looked
    up are inserted first.  Loading stops at the first date whose report
    cannot be fetched.

    Only meant to be called from init_jhu (the data source identified by
    ``src_id`` must not already be loaded).

    Args:
        c: database cursor used for all lookups and inserts.
        conn: connection owning ``c``; committed after every report and after
            each newly inserted region/district.
        src_id: value stored in the ``source_id`` column of every case row.
        max_days: optional cap on the number of daily reports to load;
            ``None`` (the default) loads every available report.

    Returns:
        The set of normalized country names for which get_country_code()
        returned None, i.e. rows that were skipped.
    """
    import traceback

    report_date = datetime.datetime(2020, 1, 22)
    missing_countries = set()  # countries that may need adding to the countries table

    # Last cumulative totals per code.  Country and region codes share one
    # mapping because they are unique from each other; district codes overlap
    # with region codes and therefore get their own.
    previous = {column: {} for column in _JHU_COUNT_COLUMNS}
    previous_district = {column: {} for column in _JHU_COUNT_COLUMNS}

    # TODO test again after the Namibia issue from prototype_main_backend is fixed
    days_loaded = 0
    while max_days is None or days_loaded < max_days:
        date = report_date.strftime("%m-%d-%Y")
        try:
            report = _read_jhu_daily_report(_JHU_DAILY_REPORTS_URL + date + '.csv')
        except OSError:
            # No report for this date (e.g. HTTP 404) or the host is
            # unreachable: this is the normal end of the import.
            break
        except Exception:
            traceback.print_exc()
            break

        try:
            _import_jhu_report(c, conn, src_id, date, report, previous, previous_district, missing_countries)
            conn.commit()  # runs after every csv
        except Exception:
            # Stay best-effort, but do not hide why the import stopped early.
            traceback.print_exc()
            break

        report_date += datetime.timedelta(days=1)
        days_loaded += 1

    return missing_countries