From 3360c8de84aeccadb9ad839020f9f7ca9d999cef Mon Sep 17 00:00:00 2001
From: "Fanchong(Ivy) Wang" <fw29@cs.washington.edu>
Date: Thu, 3 Mar 2022 02:28:02 -0800
Subject: [PATCH] add init and update for US (country and county level case
 data and country and state level vaccination data)

---
 daily_data_scripts/daily_na.py             | 112 +++++++++++++++++++
 initial_data_scripts/init_north_america.py | 122 ++++++++++++++++++++-
 prototype_main_backend.py                  |   2 +-
 util.py                                    |   2 +-
 4 files changed, 235 insertions(+), 3 deletions(-)
 create mode 100644 daily_data_scripts/daily_na.py

diff --git a/daily_data_scripts/daily_na.py b/daily_data_scripts/daily_na.py
new file mode 100644
index 0000000..eea73c2
--- /dev/null
+++ b/daily_data_scripts/daily_na.py
@@ -0,0 +1,112 @@
+import pandas as pd
+import sqlite3
+import sys
+import datetime
+import requests
+from datetime import date
+from datetime import datetime 
+
+sys.path.append("..")
+from util import *
+
+#install html parse tool
+from urllib.request import urlopen
+# requires beautifulsoup4: pip install beautifulsoup4
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+
+#update country and county level case data and vaccination data for country and state
+def update_us():
+    conn = sqlite3.connect('prototype_db')
+    c = conn.cursor()
+    
+    
+    # get country_code for US
+    us_code = get_country_code("United States", c)
+    
+    #get source id for US data
+    us_src_url = "https://github.com/nytimes/covid-19-data"
+    us_src = get_source_id(us_src_url, c)
+    us_src_v = "https://covid.cdc.gov/covid-data-tracker/#datatracker-home"
+    us_src_v = get_source_id(us_src_v, c)
+    
+    us_country = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv")
+    #us_state = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-states.csv")
+    # just use recent data for counties, otherwise too large (can change later)
+    us_county = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-recent.csv")
+    us_sv = pd.read_csv("https://data.cdc.gov/api/views/rh2h-3yt2/rows.csv")
+    
+    #insert data for US
+    us_country = us_country[::-1]
+    for index, row in us_country.iterrows():
+        date1 = row['date']
+        c.execute('SELECT * FROM Cases_Per_Country WHERE country_code ="' + us_code + '" AND date_collected ="' + str(date1)+ '"')
+        result = c.fetchall()
+        if len(result) == 0:
+            sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
+            c.execute(sql,(us_code, row["date"], us_src, row["deaths"], row["cases"]))
+        else:
+            break
+    conn.commit()
+    
+    region_dict = {}
+    #get state code for US
+    c.execute("SELECT region_code, region_name from Regions Where country_code = 'US'")
+    result = c.fetchall()
+
+    for i in range(0,len(result)):
+        region_dict[result[i][1]] = result[i][0]
+    
+    #insert county code and data
+    county_dict = {}
+    us_county = us_county[::-1]
+    for index, row in us_county.iterrows():
+        state = row["state"]
+        county = row["county"]
+        print(region_dict[state], county)
+        county_code = get_district_code(region_dict[state], county, c)
+        date1 = row['date']
+        c.execute('SELECT * FROM Cases_Per_District WHERE district_code = ' + str(county_code) + ' AND date_collected ="' + str(date1)+ '"')
+        result = c.fetchall()
+        if len(result) == 0:
+            sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
+            c.execute(sql,(county_code, row["date"], us_src, row["deaths"], row["cases"]))
+        else:
+            break
+    conn.commit()
+    
+    #get abb 
+    abb = {}
+    wikiurl="https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#cite_note-:0-18"
+    table_class="wikitable sortable jquery-tablesorter"
+    response=requests.get(wikiurl)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    table = soup.find_all('table',{'class':"wikitable"})
+    us_abb = pd.read_html(str(table))
+    for i in range(0, 3):
+        us_abb1 = pd.DataFrame(us_abb[i])
+        for index, row in us_abb1.iterrows():
+            state = row[0]
+            state = state.replace("[D]", "")
+            state = state.replace("U.S. ", "")
+            abb[row[1]] = state
+    abb["PW"] = "Palau"
+    abb["MH"] = "Marshall Islands"
+    abb["FM"] = "Federated States of Micronesia"
+    
+    #insert vaccination data for country and state
+    for index, row in us_sv.iterrows():
+        date1 = row['Date']
+        c.execute('SELECT * FROM Vaccinations_Per_Country WHERE country_code ="' + us_code + '" AND date_collected ="' + str(date1)+ '"')
+        result = c.fetchall()
+        if len(result) == 0:
+            if row["date_type"] == "Report":
+                if row["Location"] == "US":
+                    sql = '''INSERT INTO Vaccinations_Per_Country (date_collected, first_vaccination_number, second_vaccination_number,  third_vaccination_number, country_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
+                    c.execute(sql,(row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"], us_code, us_src_v))
+                else:
+                    sql = '''INSERT INTO Vaccinations_Per_Region (date_collected, first_vaccination_number, second_vaccination_number,  third_vaccination_number, region_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
+                    c.execute(sql,(row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"], region_dict[abb[row["Location"]]], us_src_v))
+        else:
+            break
+    conn.commit()
\ No newline at end of file
diff --git a/initial_data_scripts/init_north_america.py b/initial_data_scripts/init_north_america.py
index f01a745..49f67f9 100644
--- a/initial_data_scripts/init_north_america.py
+++ b/initial_data_scripts/init_north_america.py
@@ -1 +1,121 @@
-# JHU data for US states is handled in init_global.py
\ No newline at end of file
+# JHU data for US states is handled in init_global.py
+import pandas as pd
+import sqlite3
+import sys
+import datetime
+from datetime import date
+import requests
+from datetime import datetime   
+
+sys.path.append("..")
+from util import *
+
+#install html parse tool
+from urllib.request import urlopen
+# requires beautifulsoup4: pip install beautifulsoup4
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+
+#add country and county level case data and vaccination data for country and state
+def init_us():
+    conn = sqlite3.connect('prototype_db')
+    c = conn.cursor()
+    
+    
+    # get country_code for US
+    us_code = get_country_code("United States", c)
+    
+    #insert and get source id for US data
+    us_src_url = "https://github.com/nytimes/covid-19-data"
+    set_source(us_src_url, c, conn)
+    us_src = get_source_id(us_src_url, c)
+    us_src_v = "https://covid.cdc.gov/covid-data-tracker/#datatracker-home"
+    set_source(us_src_v, c, conn)
+    us_src_v = get_source_id(us_src_v, c)
+    
+    us_country = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv")
+    #us_state = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-states.csv")
+    # just use recent data for counties, otherwise too large (can change later)
+    us_county = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-recent.csv")
+    us_sv = pd.read_csv("https://data.cdc.gov/api/views/rh2h-3yt2/rows.csv")
+    
+    #insert data for US
+    for index, row in us_country.iterrows():
+        sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
+        c.execute(sql,(us_code, row["date"], us_src, row["deaths"], row["cases"]))
+    conn.commit()
+    
+    region_dict = {}
+    #get state code for US
+    c.execute("SELECT region_code, region_name from Regions Where country_code = 'US'")
+    result = c.fetchall()
+
+    for i in range(0,len(result)):
+        region_dict[result[i][1]] = result[i][0]
+    
+    #insert county code and data
+    county_dict = {}
+    for index, row in us_county.iterrows():
+        state = row["state"]
+        county = row["county"]
+        if state not in region_dict:
+            sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
+            c.execute(sql,(state, us_code))
+            region_dict[state] = get_region_code(us_code, state, c)
+        if county not in county_dict:
+            sql = '''INSERT INTO Districts (district_name, region_code) VALUES (?, ?)'''
+            c.execute(sql,(county, region_dict[state]))
+            county_dict[county] = get_district_code(region_dict[state], county, c)
+        sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers) VALUES (?, ?, ?, ?, ?)'''
+        c.execute(sql,(county_dict[county], row["date"], us_src, row["deaths"], row["cases"]))
+    conn.commit()
+    
+    #get and insert population data
+    abb = {}
+    wikiurl="https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#cite_note-:0-18"
+    table_class="wikitable sortable jquery-tablesorter"
+    response=requests.get(wikiurl)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    table = soup.find_all('table',{'class':"wikitable"})
+    us_abb = pd.read_html(str(table))
+    for i in range(0, 3):
+        us_abb1 = pd.DataFrame(us_abb[i])
+        for index, row in us_abb1.iterrows():
+            state = row[0]
+            state = state.replace("[D]", "")
+            state = state.replace("U.S. ", "")
+            abb[row[1]] = state
+            if state not in region_dict:
+                sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
+                c.execute(sql,(state, us_code))
+                region_dict[state] = get_region_code(us_code, state, c)
+            sql = '''INSERT INTO Population_Per_Region (region_code, population_amount, date_collected) VALUES (?, ?, ?)'''
+            c.execute(sql,(region_dict[state], row[5], datetime(2020, 4, 1).date()))
+    sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
+    c.execute(sql,("Palau", us_code))
+    region_dict["Palau"] = get_region_code(us_code, "Palau", c)
+    sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
+    c.execute(sql,("Marshall Islands", us_code))
+    region_dict["Marshall Islands"] = get_region_code(us_code, "Marshall Islands", c)
+    sql = '''INSERT INTO Regions (region_name, country_code) VALUES (?, ?)'''
+    c.execute(sql,("Federated States of Micronesia", us_code))
+    region_dict["Federated States of Micronesia"] = get_region_code(us_code, "Federated States of Micronesia", c)
+    abb["PW"] = "Palau"
+    abb["MH"] = "Marshall Islands"
+    abb["FM"] = "Federated States of Micronesia"
+    conn.commit()
+    
+    sql = '''INSERT INTO Population_Per_Country (country_code, population_amount, date_collected) VALUES (?, ?, ?)'''
+    c.execute(sql,(us_code, 334735155, datetime(2020, 4, 1).date()))   
+    conn.commit()
+    
+    #insert vaccination data for country and state
+    for index, row in us_sv.iterrows():
+        if row["date_type"] == "Report":
+            if row["Location"] == "US":
+                sql = '''INSERT INTO Vaccinations_Per_Country (date_collected, first_vaccination_number, second_vaccination_number,  third_vaccination_number, country_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
+                c.execute(sql,(row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"], us_code, us_src_v))
+            else:
+                sql = '''INSERT INTO Vaccinations_Per_Region (date_collected, first_vaccination_number, second_vaccination_number,  third_vaccination_number, region_code, source_id) VALUES (?, ?, ?, ?, ?, ?)'''
+                c.execute(sql,(row["Date"], row["Admin_Dose_1_Cumulative"], row["Series_Complete_Cumulative"], row["Booster_Cumulative"], region_dict[abb[row["Location"]]], us_src_v))
+    conn.commit()
\ No newline at end of file
diff --git a/prototype_main_backend.py b/prototype_main_backend.py
index 850fb73..21bd128 100644
--- a/prototype_main_backend.py
+++ b/prototype_main_backend.py
@@ -211,5 +211,5 @@ countries.to_sql('Countries',con=conn, if_exists = 'append', index=False)
 
 c.close()
 from initial_data_scripts.init_europe import init_italy, init_ukraine
-from initial_data_scripts.init_asia import init_japan, init_korea
+from initial_data_scripts.init_asia import init_japan, init_korea, init_ina
 from initial_data_scripts.init_global import init_jhu
diff --git a/util.py b/util.py
index bb46fb8..53d6bd8 100644
--- a/util.py
+++ b/util.py
@@ -13,7 +13,7 @@ def get_region_code(country_code, region_name, c):
 # TODO Why does this insist on casting region_code to str?
 # Get district code associated with district_name and region_code from Districts table.
 def get_district_code(region_code, district_name, c):
-  c.execute('SELECT district_code FROM Districts WHERE region_code = "' + str(region_code) + '" AND district_name = "' + district_name + '"')
+  c.execute('SELECT district_code FROM Districts WHERE region_code = ' + str(region_code) + ' AND district_name = "' + district_name + '"')
   result = c.fetchall()
   return result[0][0] if result != [] else None
 
-- 
GitLab