# Imports for daily_data_scripts/daily_na.py.
# beautifulsoup4 has to be installed in the environment beforehand: the
# notebook-style "!pip install beautifulsoup4" line is not valid Python in a
# module and would make this file fail to import.
import datetime
import sqlite3
import sys
from datetime import date
from datetime import datetime
from io import StringIO
from urllib.request import urlopen

import pandas as pd

sys.path.append("..")

_DAILY_NYT_BASE = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/"
_DAILY_CDC_VACCINATIONS = "https://data.cdc.gov/api/views/rh2h-3yt2/rows.csv"
_DAILY_WIKI_STATES = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#cite_note-:0-18"


def _daily_insert_country_cases(c, us_country, us_code, us_src):
    """Insert country-level case rows for dates not yet stored for us_code."""
    c.execute("SELECT date_collected FROM Cases_Per_Country WHERE country_code = ?", (us_code,))
    known = {str(found[0]) for found in c.fetchall()}
    sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
    for _, row in us_country.iterrows():
        day = str(row["date"])
        if day in known:
            continue
        c.execute(sql, (us_code, row["date"], us_src, row["deaths"], row["cases"]))
        known.add(day)


def _daily_insert_county_cases(c, us_county, region_dict, us_code, us_src):
    """Insert county-level case rows that are not yet in Cases_Per_District."""
    # One lookup table instead of one SELECT per CSV row.  Region codes are
    # compared as strings because the column types of Regions and Districts
    # are not visible from this script.
    c.execute(
        "SELECT district_code, region_code, district_name FROM Districts "
        "WHERE region_code IN (SELECT region_code FROM Regions WHERE country_code = ?)",
        (us_code,),
    )
    districts = {}
    for district_code, region_code, district_name in c.fetchall():
        districts.setdefault((str(region_code), district_name), district_code)

    # Only rows for the dates present in the download can collide, so fetch
    # just those.  The placeholder list is chunked to stay far below SQLite's
    # bound-parameter limit.
    known = set()
    days = sorted({str(day) for day in us_county["date"]})
    for start in range(0, len(days), 500):
        chunk = days[start:start + 500]
        marks = ",".join("?" * len(chunk))
        c.execute(
            "SELECT district_code, date_collected FROM Cases_Per_District "
            "WHERE date_collected IN (" + marks + ")",
            chunk,
        )
        known.update((str(code), str(day)) for code, day in c.fetchall())

    sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
    for _, row in us_county.iterrows():
        region_code = region_dict.get(row["state"])
        if region_code is None:
            # State was never registered in Regions; nothing to attach to.
            continue
        county_code = districts.get((str(region_code), row["county"]))
        if county_code is None:
            # County was never registered in Districts; skip instead of
            # storing a row with a NULL district_code.
            continue
        key = (str(county_code), str(row["date"]))
        if key in known:
            continue
        c.execute(sql, (county_code, row["date"], us_src, row["deaths"], row["cases"]))
        known.add(key)


def _daily_fetch_abbreviations():
    """Return a dict mapping postal abbreviation -> state/territory name."""
    # Imported here so the module itself imports without these packages.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(_DAILY_WIKI_STATES)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': "wikitable"})
    # read_html expects a file-like object; literal HTML strings are deprecated.
    frames = pd.read_html(StringIO(str(tables)))

    abb = {}
    # Only the first three tables are read, as before; slicing tolerates a
    # page that yields fewer.
    for frame in frames[:3]:
        for _, row in frame.iterrows():
            # Positional access: first column is the name, second the
            # abbreviation.
            state = str(row.iloc[0]).replace("[D]", "").replace("U.S. ", "")
            abb[row.iloc[1]] = state
    abb["PW"] = "Palau"
    abb["MH"] = "Marshall Islands"
    abb["FM"] = "Federated States of Micronesia"
    return abb


def _daily_insert_vaccinations(c, us_sv, abb, region_dict, us_code, us_src_v):
    """Insert country and state vaccination rows that are not yet stored."""
    c.execute("SELECT date_collected FROM Vaccinations_Per_Country WHERE country_code = ?", (us_code,))
    known_country = {str(found[0]) for found in c.fetchall()}
    c.execute(
        "SELECT region_code, date_collected FROM Vaccinations_Per_Region "
        "WHERE region_code IN (SELECT region_code FROM Regions WHERE country_code = ?)",
        (us_code,),
    )
    known_region = {(str(code), str(day)) for code, day in c.fetchall()}

    country_sql = '''INSERT INTO Vaccinations_Per_Country (date_collected, first_vaccination_number, second_vaccination_number, third_vaccination_number, country_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
    region_sql = '''INSERT INTO Vaccinations_Per_Region (date_collected, first_vaccination_number, second_vaccination_number, third_vaccination_number, region_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
    for _, row in us_sv.iterrows():
        if row["date_type"] != "Report":
            continue
        day = str(row["Date"])
        doses = (row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"])
        if row["Location"] == "US":
            if day in known_country:
                continue
            c.execute(country_sql, doses + (us_code, us_src_v))
            known_country.add(day)
        else:
            # Each table is checked on its own key, so storing the national
            # row for a date no longer stops the state rows of that date.
            region_code = region_dict.get(abb.get(row["Location"]))
            if region_code is None:
                # Location code without a matching region; skip it.
                continue
            key = (str(region_code), day)
            if key in known_region:
                continue
            c.execute(region_sql, doses + (region_code, us_src_v))
            known_region.add(key)


#update country and county level case data and vaccination data for country and state
def update_us():
    """Add US rows that are missing from the prototype_db SQLite database.

    Downloads the NYT rolling-average CSVs (country and recent county level)
    and the CDC vaccination CSV, then inserts every row whose key is not
    stored yet:

    * Cases_Per_Country, keyed by (country_code, date)
    * Cases_Per_District, keyed by (district_code, date)
    * Vaccinations_Per_Country, keyed by (country_code, date)
    * Vaccinations_Per_Region, keyed by (region_code, date)

    Existing keys are loaded once per table and all statements use bound
    parameters. Each of the three sections is committed separately, and the
    connection is closed even when a download or insert fails.

    Takes no arguments and returns None.
    """
    # util lives one directory up (see sys.path.append above); importing it
    # here keeps the module importable where util is not on the path.
    from util import get_country_code, get_source_id

    conn = sqlite3.connect('prototype_db')
    try:
        c = conn.cursor()

        # get country_code for US
        us_code = get_country_code("United States", c)

        #get source id for US data
        us_src = get_source_id("https://github.com/nytimes/covid-19-data", c)
        us_src_v = get_source_id("https://covid.cdc.gov/covid-data-tracker/#datatracker-home", c)

        us_country = pd.read_csv(_DAILY_NYT_BASE + "us.csv")
        #just use recent data for counties otherwise too large(can change later)
        us_county = pd.read_csv(_DAILY_NYT_BASE + "us-counties-recent.csv")
        us_sv = pd.read_csv(_DAILY_CDC_VACCINATIONS)

        #insert data for US
        _daily_insert_country_cases(c, us_country, us_code, us_src)
        conn.commit()

        #get state code for US
        c.execute("SELECT region_code, region_name FROM Regions WHERE country_code = ?", (us_code,))
        region_dict = {name: code for code, name in c.fetchall()}

        #insert county data
        _daily_insert_county_cases(c, us_county, region_dict, us_code, us_src)
        conn.commit()

        #insert vaccination data for country and state
        abb = _daily_fetch_abbreviations()
        _daily_insert_vaccinations(c, us_sv, abb, region_dict, us_code, us_src_v)
        conn.commit()
    finally:
        conn.close()
# Imports for initial_data_scripts/init_north_america.py.
# beautifulsoup4 has to be installed in the environment beforehand: the
# notebook-style "!pip install beautifulsoup4" line is not valid Python in a
# module and would make this file fail to import.
import datetime
import sqlite3
import sys
from datetime import date
from datetime import datetime
from io import StringIO
from urllib.request import urlopen

import pandas as pd

sys.path.append("..")

_INIT_NYT_BASE = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/"
_INIT_CDC_VACCINATIONS = "https://data.cdc.gov/api/views/rh2h-3yt2/rows.csv"
_INIT_WIKI_STATES = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#cite_note-:0-18"


# Get district code associated with district_name and region_code from Districts table.
def get_district_code(region_code, district_name, c):
    """Return the district_code of a district, or None when it is not stored.

    Args:
        region_code: region_code of the region the district belongs to.
        district_name: name of the district.
        c: sqlite3 cursor on the database that holds the Districts table.

    Returns:
        district_code of the first matching row, or None if nothing matches.

    Both values are bound as parameters. The concatenated query ran the
    region code straight into the AND keyword, which is invalid SQL, and it
    broke on names containing a double quote.
    """
    c.execute(
        'SELECT district_code FROM Districts WHERE region_code = ? AND district_name = ?',
        (region_code, district_name),
    )
    result = c.fetchall()
    return result[0][0] if result else None


def _init_region_code(c, region_dict, region_name, us_code):
    """Return the region_code for region_name, inserting the region if new."""
    # util lives one directory up (see sys.path.append above).
    from util import get_region_code

    if region_name not in region_dict:
        sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
        c.execute(sql, (region_name, us_code))
        region_dict[region_name] = get_region_code(us_code, region_name, c)
    return region_dict[region_name]


def _init_country_cases(c, us_country, us_code, us_src):
    """Insert every row of the country-level case frame."""
    sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
    for _, row in us_country.iterrows():
        c.execute(sql, (us_code, row["date"], us_src, row["deaths"], row["cases"]))


def _init_county_cases(c, us_county, region_dict, us_code, us_src):
    """Register missing states and counties and insert every county case row."""
    district_sql = '''INSERT INTO Districts (district_name, region_code) VALUES (?, ?)'''
    cases_sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
    # Keyed by (state, county): a county name alone is not unique across
    # states, and keying on the name made such counties share one district.
    county_dict = {}
    for _, row in us_county.iterrows():
        state = row["state"]
        county = row["county"]
        region_code = _init_region_code(c, region_dict, state, us_code)
        key = (state, county)
        if key not in county_dict:
            district_code = get_district_code(region_code, county, c)
            if district_code is None:
                c.execute(district_sql, (county, region_code))
                district_code = get_district_code(region_code, county, c)
            county_dict[key] = district_code
        c.execute(cases_sql, (county_dict[key], row["date"], us_src, row["deaths"], row["cases"]))


def _init_regions_and_population(c, region_dict, us_code):
    """Store per-region population and return abbreviation -> region name."""
    # Imported here so the module itself imports without these packages.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(_INIT_WIKI_STATES)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': "wikitable"})
    # read_html expects a file-like object; literal HTML strings are deprecated.
    frames = pd.read_html(StringIO(str(tables)))

    # Stored as ISO text, the value the sqlite3 default date adapter produced,
    # without relying on that deprecated adapter.
    population_date = date(2020, 4, 1).isoformat()
    population_sql = '''INSERT INTO Population_Per_Region (region_code, population_amount, date_collected) VALUES (?, ?, ?)'''

    abb = {}
    # Only the first three tables are read, as before; slicing tolerates a
    # page that yields fewer.
    for frame in frames[:3]:
        for _, row in frame.iterrows():
            # Positional access: column 0 is the name, column 1 the
            # abbreviation, column 5 the population figure.
            state = str(row.iloc[0]).replace("[D]", "").replace("U.S. ", "")
            abb[row.iloc[1]] = state
            region_code = _init_region_code(c, region_dict, state, us_code)
            c.execute(population_sql, (region_code, row.iloc[5], population_date))

    # These three are added by hand; they are inserted only when absent so
    # Regions does not collect duplicates.
    extra_regions = (
        ("PW", "Palau"),
        ("MH", "Marshall Islands"),
        ("FM", "Federated States of Micronesia"),
    )
    for code, name in extra_regions:
        _init_region_code(c, region_dict, name, us_code)
        abb[code] = name
    return abb


def _init_vaccinations(c, us_sv, abb, region_dict, us_code, us_src_v):
    """Insert the "Report" vaccination rows for the country and its regions."""
    country_sql = '''INSERT INTO Vaccinations_Per_Country (date_collected, first_vaccination_number, second_vaccination_number, third_vaccination_number, country_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
    region_sql = '''INSERT INTO Vaccinations_Per_Region (date_collected, first_vaccination_number, second_vaccination_number, third_vaccination_number, region_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
    for _, row in us_sv.iterrows():
        if row["date_type"] != "Report":
            continue
        doses = (row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"])
        if row["Location"] == "US":
            c.execute(country_sql, doses + (us_code, us_src_v))
            continue
        region_code = region_dict.get(abb.get(row["Location"]))
        if region_code is None:
            # Location code without a matching region; skip it rather than
            # abort the whole import with a KeyError.
            continue
        c.execute(region_sql, doses + (region_code, us_src_v))


#add country and county level case data and vaccination data for country and state
def init_us():
    """Load the initial US data set into the prototype_db SQLite database.

    Registers the NYT and CDC sources, then inserts:

    * country-level cases from the NYT rolling-average file
    * county-level cases from the NYT recent-counties file, creating missing
      Regions and Districts rows on the way
    * per-region population from the Wikipedia state tables, plus one
      country-level population row
    * country- and region-level vaccination rows from the CDC file

    The work is committed after each section and the connection is closed
    even when a download or insert fails. Takes no arguments and returns
    None.
    """
    # util lives one directory up (see sys.path.append above); importing it
    # here keeps the module importable where util is not on the path.
    from util import get_country_code, get_source_id, set_source

    conn = sqlite3.connect('prototype_db')
    try:
        c = conn.cursor()

        # get country_code for US
        us_code = get_country_code("United States", c)

        #insert and get source id for US data
        us_src_url = "https://github.com/nytimes/covid-19-data"
        set_source(us_src_url, c, conn)
        us_src = get_source_id(us_src_url, c)
        us_src_v_url = "https://covid.cdc.gov/covid-data-tracker/#datatracker-home"
        set_source(us_src_v_url, c, conn)
        us_src_v = get_source_id(us_src_v_url, c)

        us_country = pd.read_csv(_INIT_NYT_BASE + "us.csv")
        #just use recent data for counties otherwise too large(can change later)
        us_county = pd.read_csv(_INIT_NYT_BASE + "us-counties-recent.csv")
        us_sv = pd.read_csv(_INIT_CDC_VACCINATIONS)

        #insert data for US
        _init_country_cases(c, us_country, us_code, us_src)
        conn.commit()

        #get state code for US
        c.execute("SELECT region_code, region_name FROM Regions WHERE country_code = ?", (us_code,))
        region_dict = {name: code for code, name in c.fetchall()}

        #insert county code and data
        _init_county_cases(c, us_county, region_dict, us_code, us_src)
        conn.commit()

        #get and insert population data
        abb = _init_regions_and_population(c, region_dict, us_code)
        conn.commit()

        sql = '''INSERT INTO Population_Per_Country (country_code, population_amount, date_collected) VALUES (?, ?, ?)'''
        c.execute(sql, (us_code, 334735155, date(2020, 4, 1).isoformat()))
        conn.commit()

        #insert vaccination data for country and state
        _init_vaccinations(c, us_sv, abb, region_dict, us_code, us_src_v)
        conn.commit()
    finally:
        conn.close()