Ivy Wang / COVID-19-Data-Integration / Commits / 005db1e8

Commit 005db1e8, authored 3 years ago by Arshana Jain
Parent: 777a39fc

    rough draft jhu daily

Showing 2 changed files with 307 additions and 3 deletions:

    daily_data_scripts/daily_global.py    +289  −0   (289 additions, 0 deletions)
    initial_data_scripts/init_global.py    +18  −3   (18 additions, 3 deletions)
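For context, a minimal driver sketch for the new entry point. This is hypothetical and not part of the commit: the commit only adds the daily_jhu() function, with no __main__ guard, and the sketch assumes it is run from inside daily_data_scripts/ with prototype_db, util.py, and a seeded jhu_us.json already in place.

    # hypothetical driver, not part of this commit
    from daily_global import daily_jhu

    if __name__ == '__main__':
        daily_jhu()  # pull the recent JHU daily-report CSVs and append per-state rows to prototype_db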
daily_data_scripts/daily_global.py  (+289, −0)  — view file @ 005db1e8
import pandas as pd
import sqlite3
import datetime
import sys
import json
from util import *


def daily_jhu():
    conn = sqlite3.connect('prototype_db')
    c = conn.cursor()
    # insert and get source id for source
    src_url = "https://github.com/CSSEGISandData/COVID-19"
    set_source(src_url, c, conn)
    src_id = get_source_id(src_url, c)
    daily_jhu_us_states(c, conn, src_id)
    #daily_jhu_global(c, conn, src_id)
    conn.close()


# US States from JHU data
# ONLY SAFE TO CALL FROM daily_jhu in this state
def daily_jhu_us_states(c, conn, src_id):
    i = 0
    prev_death_dict = {}
    prev_recovered_dict = {}
    prev_case_dict = {}
    prev_hospitalized_dict = {}
    with open('jhu_us.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_death_dict = json.loads(line)
            elif i == 1:
                prev_recovered_dict = json.loads(line)
            elif i == 2:
                prev_case_dict = json.loads(line)
            elif i == 3:
                prev_hospitalized_dict = json.loads(line)
            i += 1
    f.close()
    # get country_code
    us_code = get_country_code("United States", c)
    # insert state data in Cases_per_Region
    # the data is cumulative - need the previous data to accurately update the new data
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    last_error = ""
    for i in range(0, 3):
        date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + \
               ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, src_id))
        already_entered = c.fetchall() != []  # rows already exist for this date and source
        if not already_entered:
            csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/' + date + '.csv'
            try:
                df = pd.read_csv(csv_name, error_bad_lines=False)
                for row in df.itertuples():
                    region_code = get_region_code(us_code, row.Province_State, c)
                    prev_death = 0 if region_code not in prev_death_dict else prev_death_dict[region_code]
                    prev_recovered = 0 if region_code not in prev_recovered_dict else prev_recovered_dict[region_code]
                    prev_case = 0 if region_code not in prev_case_dict else prev_case_dict[region_code]
                    prev_hospitalized = 0 if region_code not in prev_hospitalized_dict else prev_hospitalized_dict[region_code]
                    if region_code is not None:
                        sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers, hospitalization_numbers) VALUES (?, ?, ?, ?, ?, ?, ?)'''
                        # handles the case of a blank column by inserting None
                        c.execute(sql, (region_code, date, src_id,
                                        row.Deaths - prev_death if row.Deaths is not None else None,
                                        row.Confirmed - prev_case if row.Confirmed is not None else None,
                                        row.Recovered - prev_recovered if row.Recovered is not None else None,
                                        row.People_Hospitalized - prev_hospitalized if row.People_Hospitalized is not None else None))
                        # update previous
                        if row.Deaths is not None:
                            prev_death_dict[region_code] = row.Deaths
                        if row.Recovered is not None:
                            prev_recovered_dict[region_code] = row.Recovered
                        if row.Confirmed is not None:
                            prev_case_dict[region_code] = row.Confirmed
                        if row.People_Hospitalized is not None:
                            prev_hospitalized_dict[region_code] = row.People_Hospitalized
                    else:
                        last_error = (row.Province_State + " was missing from the Regions table - init_jhu_us_states " + csv_name + ".")
                conn.commit()
            except:
                break
        dt += datetime.timedelta(days=1)
    print(last_error)
    with open('jhu_us.json', 'w') as f:
        f.write(json.dumps(prev_death_dict) + '\n')
        f.write(json.dumps(prev_recovered_dict) + '\n')
        f.write(json.dumps(prev_case_dict) + '\n')
        f.write(json.dumps(prev_hospitalized_dict) + '\n')
    f.close()


# Global JHU data
# ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.)
# First csv: 01-22-2020
def init_jhu_global(c, conn, src_id):
    missing_countries_set = set(())  # used to keep track of any countries that might need to be added to the countries table - for debugging purposes
    # can be used for country and region codes since they are unique from each other
    prev_death_dict = {}
    prev_recovered_dict = {}
    prev_case_dict = {}
    # for subregion since codes overlap with region
    prev_death_dict_subregion = {}
    prev_recovered_dict_subregion = {}
    prev_case_dict_subregion = {}
    i = 0  # line counter for reading back the saved state file below
    with open('jhu_global.json', 'r') as f:
        for line in f:
            if i == 0:
                prev_death_dict = json.loads(line)
            elif i == 1:
                prev_recovered_dict = json.loads(line)
            elif i == 2:
                prev_case_dict = json.loads(line)
            elif i == 3:
                prev_death_dict_subregion = json.loads(line)
            elif i == 4:
                prev_recovered_dict_subregion = json.loads(line)
            elif i == 5:
                prev_case_dict_subregion = json.loads(line)
            i += 1
    f.close()
    # TODO test again after the Namibia issue from prototype_main_backend is fixed
    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    for i in range(0, 3):
        date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + \
               ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
        sql = '''SELECT date_collected FROM Cases_Per_Region WHERE date_collected = ? AND source_id = ?'''
        c.execute(sql, (date, src_id))
        already_entered = c.fetchall() != []  # rows already exist for this date and source
        if not already_entered:
            csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + date + '.csv'
            try:
                df = pd.read_csv(csv_name, error_bad_lines=False)
                for row in df.itertuples():
                    # normalize country name
                    country_name = None
                    if "Country_Region" in df.columns:
                        country_name = str(row.Country_Region).strip()
                    else:
                        country_name = str(row._2).strip()
                    if (country_name == "Burma"):
                        country_name = "Myanmar"
                    elif (country_name == "Czechia"):
                        country_name = "Czech Republic"
                    elif (country_name == "Taiwan*"):
                        country_name = "Taiwan"
                    elif (country_name == "Korea, South"):
                        country_name = "South Korea"
                    elif (country_name == "US"):
                        country_name = "United States"
                    elif (country_name == "Congo (Brazzaville)" or country_name == "Republic of the Congo"):
                        country_name = "Congo-Brazzaville"
                    elif (country_name == "Congo (Kinshasa)"):
                        country_name = "Congo-Kinshasa"
                    elif (country_name == "Mainland China"):
                        country_name = "China"
                    elif (country_name == "Macau" or country_name == "Macao SAR"):
                        country_name = "Macao"
                    elif (country_name == "Bahamas, The" or country_name == "The Bahamas"):
                        country_name = "Bahamas"
                    elif (country_name == "Republic of Korea"):
                        country_name = "South Korea"
                    elif (country_name == "The Gambia" or country_name == "Gambia, The"):
                        country_name = "Gambia"
                    elif (country_name == "Ivory Coast"):
                        country_name = "Cote d'Ivoire"
                    elif (country_name == "Hong Kong SAR"):
                        country_name = "Hong Kong"
                    elif (country_name == "Republic of Ireland"):
                        country_name = "Ireland"
                    elif (country_name == "East Timor"):
                        country_name = "Timor-Leste"
                    elif (country_name == "Russian Federation"):
                        country_name = "Russia"
                    elif (country_name == "Republic of Moldova"):
                        country_name = "Moldova"
                    elif (country_name == "Iran (Islamic Republic of)"):
                        country_name = "Iran"
                    elif (country_name == "Viet Nam"):
                        country_name = "Vietnam"
                    elif (country_name == "Cape Verde"):
                        country_name = "Cabo Verde"
                    elif (country_name == "Vatican City"):
                        country_name = "Holy See"
                    elif (country_name == "UK"):
                        country_name = "United Kingdom"
                    country_code = get_country_code(country_name, c)
                    if country_code is None:
                        missing_countries_set.add(country_name)
                    else:
                        region_name = None
                        if "Province_State" in df.columns:
                            region_name = str(row.Province_State).strip()
                        else:
                            region_name = str(row._1).strip()
                        if (region_name is None or str(region_name).lower() == "nan"):
                            # a country-level entry
                            prev_death = 0 if country_code not in prev_death_dict else prev_death_dict[country_code]
                            prev_recovered = 0 if country_code not in prev_recovered_dict else prev_recovered_dict[country_code]
                            prev_case = 0 if country_code not in prev_case_dict else prev_case_dict[country_code]
                            sql = '''INSERT INTO Cases_Per_Country (country_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                            # handles the case of a blank column by inserting None
                            c.execute(sql, (country_code, date, src_id,
                                            row.Deaths - prev_death if row.Deaths is not None else None,
                                            row.Confirmed - prev_case if row.Confirmed is not None else None,
                                            row.Recovered - prev_recovered if row.Recovered is not None else None))
                            # update previous
                            if row.Deaths is not None:
                                prev_death_dict[country_code] = row.Deaths
                            if row.Recovered is not None:
                                prev_recovered_dict[country_code] = row.Recovered
                            if row.Confirmed is not None:
                                prev_case_dict[country_code] = row.Confirmed
                        elif (region_name != "Recovered" and region_name != "Unknown"):
                            # a region-level entry
                            # skip Recovered row - irrelevant data - be on the look out for other special cases that haven't been noticed yet
                            region_code = get_region_code(str(country_code), str(region_name), c)
                            if region_code is None:
                                sql = '''INSERT INTO Regions (region_name, country_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                                c.execute(sql, (str(region_name), country_code,
                                                row.Long_ if 'Long_' in df.columns else None,
                                                row.Lat if 'Lat' in df.columns else None))
                                conn.commit()
                                region_code = get_region_code(str(country_code), str(region_name), c)
                            subregion_name = None
                            if "Admin2" in df.columns:
                                subregion_name = str(row.Admin2).strip()
                            if (subregion_name is None or str(subregion_name).lower() == "nan"):
                                prev_death = 0 if region_code not in prev_death_dict else prev_death_dict[region_code]
                                prev_recovered = 0 if region_code not in prev_recovered_dict else prev_recovered_dict[region_code]
                                prev_case = 0 if region_code not in prev_case_dict else prev_case_dict[region_code]
                                sql = '''INSERT INTO Cases_Per_Region (region_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                                # handles the case of a blank column by inserting None
                                c.execute(sql, (region_code, date, src_id,
                                                row.Deaths - prev_death if row.Deaths is not None else None,
                                                row.Confirmed - prev_case if row.Confirmed is not None else None,
                                                row.Recovered - prev_recovered if row.Recovered is not None else None))
                                # update previous
                                if row.Deaths is not None:
                                    prev_death_dict[region_code] = row.Deaths
                                if row.Recovered is not None:
                                    prev_recovered_dict[region_code] = row.Recovered
                                if row.Confirmed is not None:
                                    prev_case_dict[region_code] = row.Confirmed
                            elif (subregion_name != "Unassigned"):
                                subregion_code = get_district_code(region_code, str(subregion_name), c)
                                if subregion_code is None:
                                    sql = '''INSERT INTO Districts (district_name, region_code, longitude, latitude) VALUES (?, ?, ?, ?)'''
                                    c.execute(sql, (str(subregion_name), region_code,
                                                    row.Long_ if 'Long_' in df.columns else None,
                                                    row.Lat if 'Lat' in df.columns else None))
                                    conn.commit()
                                    subregion_code = get_district_code(region_code, str(subregion_name), c)
                                prev_death = 0 if subregion_code not in prev_death_dict_subregion else prev_death_dict_subregion[subregion_code]
                                prev_recovered = 0 if subregion_code not in prev_recovered_dict_subregion else prev_recovered_dict_subregion[subregion_code]
                                prev_case = 0 if subregion_code not in prev_case_dict_subregion else prev_case_dict_subregion[subregion_code]
                                sql = '''INSERT INTO Cases_Per_District (district_code, date_collected, source_id, death_numbers, case_numbers, recovery_numbers) VALUES (?, ?, ?, ?, ?, ?)'''
                                # handles the case of a blank column by inserting None
                                c.execute(sql, (subregion_code, date, src_id,
                                                row.Deaths - prev_death if row.Deaths is not None else None,
                                                row.Confirmed - prev_case if row.Confirmed is not None else None,
                                                row.Recovered - prev_recovered if row.Recovered is not None else None))
                                # update previous
                                if row.Deaths is not None:
                                    prev_death_dict_subregion[subregion_code] = row.Deaths
                                if row.Recovered is not None:
                                    prev_recovered_dict_subregion[subregion_code] = row.Recovered
                                if row.Confirmed is not None:
                                    prev_case_dict_subregion[subregion_code] = row.Confirmed
                conn.commit()  # runs after every csv
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                # debugging
                #print(exc_tb.tb_lineno)
                #print(e)
                break
        dt += datetime.timedelta(days=1)
    # debugging
    #print(missing_countries_set)
    with open('jhu_global.json', 'w') as f:
        f.write(json.dumps(prev_death_dict) + '\n')
        f.write(json.dumps(prev_recovered_dict) + '\n')
        f.write(json.dumps(prev_case_dict) + '\n')
        f.write(json.dumps(prev_death_dict_subregion) + '\n')
        f.write(json.dumps(prev_recovered_dict_subregion) + '\n')
        f.write(json.dumps(prev_case_dict_subregion) + '\n')
    f.close()
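The manual zero-padding that builds the date string above produces the MM-DD-YYYY form used in the JHU daily-report file names. A minimal equivalent sketch, assuming only the standard library (not how the committed script does it):

    import datetime

    dt = datetime.datetime.today() - datetime.timedelta(days=2)
    date = dt.strftime('%m-%d-%Y')  # e.g. '03-07-2022' - same zero-padded string the script concatenates by hand
    csv_name = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
                'csse_covid_19_data/csse_covid_19_daily_reports_us/' + date + '.csv')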
initial_data_scripts/init_global.py  (+18, −3)  — view file @ 005db1e8
@@ -2,6 +2,7 @@ import pandas as pd
 import sqlite3
 import datetime
 import sys
+import json
 from util import *
@@ -78,6 +79,13 @@ def init_jhu_us_states(c, conn, src_id):
     print(last_error)
+    with open('jhu_us.json', 'w') as f:
+        f.write(json.dumps(prev_death_dict) + '\n')
+        f.write(json.dumps(prev_recovered_dict) + '\n')
+        f.write(json.dumps(prev_case_dict) + '\n')
+        f.write(json.dumps(prev_hospitalized_dict) + '\n')
+    f.close()

 # Global JHU data
 # ONLY SAFE TO CALL FROM init_jhu in this state (otherwise consider that source may be replicated, etc.)
 # First csv: 01-22-2020
@@ -96,8 +104,7 @@ def init_jhu_global(c, conn, src_id):
     prev_case_dict_subregion = {}
     # TODO test again after the Namibia issue from prototype_main_backend is fixed
+    i = 0
-    while (True):
+    while (i < 3):
         date = ('0' if dt.month < 10 else '') + str(dt.month) + '-' + ('0' if dt.day < 10 else '') + str(dt.day) + '-' + str(dt.year)
         csv_name = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + date + '.csv'
         try:
@@ -241,4 +248,12 @@ def init_jhu_global(c, conn, src_id):
         dt += datetime.timedelta(days=1)
     # debugging
     #print(missing_countries_set)
+    with open('jhu_global.json', 'w') as f:
+        f.write(json.dumps(prev_death_dict) + '\n')
+        f.write(json.dumps(prev_recovered_dict) + '\n')
+        f.write(json.dumps(prev_case_dict) + '\n')
+        f.write(json.dumps(prev_death_dict_subregion) + '\n')
+        f.write(json.dumps(prev_recovered_dict_subregion) + '\n')
+        f.write(json.dumps(prev_case_dict_subregion) + '\n')
+    f.close()
\ No newline at end of file
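Both scripts checkpoint their cumulative totals as one JSON dict per line (four lines in jhu_us.json, six in jhu_global.json) and read them back by counting lines. A minimal sketch of that round trip; the helper names here are illustrative, not part of the commit:

    import json

    def save_state(path, dicts):
        # one JSON object per line, in the fixed order the reader expects
        with open(path, 'w') as f:
            for d in dicts:
                f.write(json.dumps(d) + '\n')

    def load_state(path, n):
        # returns n dicts; missing trailing lines stay empty
        dicts = [{} for _ in range(n)]
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                if i < n:
                    dicts[i] = json.loads(line)
        return dicts

    # jhu_us.json holds deaths, recovered, cases, hospitalized, in that order
    prev_death, prev_recovered, prev_case, prev_hospitalized = load_state('jhu_us.json', 4)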