This notebook is intended for educational purposes only; it is not a realistic analysis of confirmed COVID-19 cases.
git clone https://github.com/CSSEGISandData/COVID-19.git
If successful, you should see the CSV file in the next cell. Hint: Using `genfromtxt` will not work.
You will have to extract this by hand. The problem is that there are quotation marks in various entries (either in the country or the state) that you will have to work around.
! head -n 4 COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv
import numpy as np
import math
import copy
import matplotlib.pyplot as plt
import csv

# Path to the confirmed-cases time-series CSV inside the cloned COVID-19 repo.
s = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'
# Bounding box used to select the US mainland:
# [[min latitude, min longitude], [max latitude, max longitude]].
us_coords = [[25,-130],[50,-70]]
# Header label of the date column whose counts are extracted.
date_to_use = '3/18/20'

states = []
countries = []
lats = []
lons = []
nconfirmed = []

# csv.reader handles the quoted entries in the state/country columns.
# The context manager guarantees the file is closed once parsing finishes
# (the previous bare open() left the handle dangling).
with open(s, newline='') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')

    # The first row is the header; locate the column for the requested date.
    header = next(reader)
    index_to_use = header.index(date_to_use)

    # Every remaining row: state, country, latitude, longitude, then counts.
    for r in reader:
        state, country, lat, lon = r[0:4]
        states.append(state)
        countries.append(country)
        lats.append(float(lat))
        lons.append(float(lon))
        nconfirmed.append(float(r[index_to_use]))

# One row per location: columns are (latitude, longitude, confirmed count).
data = np.column_stack((lats, lons, nconfirmed))
states = np.array(states)
countries = np.array(countries)

# Keep only US rows, then only those strictly inside the bounding box.
usdata = data[countries == 'US']
(lat_min, lon_min), (lat_max, lon_max) = us_coords
selected = (
    (usdata[:, 0] > lat_min) & (usdata[:, 0] < lat_max)
    & (usdata[:, 1] > lon_min) & (usdata[:, 1] < lon_max)
)
mainland_us_data = usdata[selected]
## Solution:
# Plot the selected points (longitude on x, latitude on y).
plt.scatter(mainland_us_data[:,1], mainland_us_data[:,0])

ncentroids = 10   # number of clusters
eps = 1e-3        # convergence tolerance that deltamax is compared against

# Seed the centroids with `ncentroids` distinct data points chosen at random.
# The rows are sampled by index instead of shuffling mainland_us_data[:, 0:2]
# in place: shuffling that view permuted only the lat/lon columns and left
# the confirmed-count column where it was, so every row ended up with a
# count belonging to a different location.
seed_rows = np.random.choice(mainland_us_data.shape[0], size=ncentroids, replace=False)
centroids = mainland_us_data[seed_rows, 0:2].copy()

# Start well above eps so the refinement loop runs at least once.
deltamax = 10000.

# Show the data together with the initial centroids (stars).
plt.scatter(mainland_us_data[:,1], mainland_us_data[:,0])
plt.scatter(centroids[:,1], centroids[:,0], marker='*')
plt.show()
# K-means refinement: repeatedly assign each point to its nearest centroid and
# move every centroid to the mean of its assigned points, until no centroid
# coordinate moves by more than eps.
points = mainland_us_data
xy = points[:, 0:2]   # (latitude, longitude) columns only

while deltamax > eps:
    old_centroids = centroids.copy()

    # distances[k, n] = Euclidean distance from centroid k to point n,
    # computed by broadcasting (ncentroids, 1, 2) against (1, npoints, 2).
    distances = np.linalg.norm(
        xy[np.newaxis, :, :] - centroids[:, np.newaxis, :], axis=2
    )
    closest_centroid = np.argmin(distances, axis=0)

    updated = []
    for k in range(ncentroids):
        members = xy[closest_centroid == k]
        # A cluster with no members keeps its previous centroid; taking the
        # mean of an empty selection would yield NaN and silently end the
        # loop, because NaN > eps is False.
        updated.append(members.mean(axis=0) if len(members) else old_centroids[k])
    centroids = np.array(updated)

    # Largest absolute shift of any centroid coordinate. The absolute value
    # matters: a signed maximum can fall below eps while centroids are still
    # moving in the negative direction.
    deltamax = np.max(np.abs(centroids - old_centroids))
    print(deltamax)

colors = np.array(['r', 'g', 'b', 'y', 'c', 'm', 'darkviolet', 'brown', 'teal', 'sandybrown'])
# Colour each point by its cluster; wrap around the palette so that more
# clusters than colours does not raise an IndexError.
plt.scatter( points[:,1], points[:,0], c = colors[closest_centroid % len(colors)])
plt.scatter( centroids[:,1], centroids[:,0], c = 'k', s=200, marker='o')