# Starting libraries we will use 
import pandas as pd               # Pandas for accessing dataset
import numpy as np                # Numpy for manipulating numbers and calculations
import matplotlib.pyplot as plt   # MatplotLib for graphs
import seaborn as sns             # Seaborn for beautifying graphs


data = pd.read_csv("covid-us-counties.csv")
data.sample(10)


# Remove column name 'fips' 
data = data.drop(['fips'], axis = 1)
# Remove all the rows that have NaN values in them
data.dropna()
# Make sure the date is in date format
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data_copy = data # For later reference

data_copy.sample(5)


# Making a copy of the data so we can plot
month_table = data_copy[['date', 'cases', 'deaths']].copy()
#month_table.groupby(month_table['date'].dt.strftime('%B'))[['cases', 'deaths']].apply(sum)
plt.style.use('seaborn')
plt.rc('figure', figsize=(20,8))

# Group data by each month and graph a plot for each month
for month, df in month_table.groupby(month_table['date'].dt.strftime('%B'), sort=False):        
    x = df['date']
    y = df['cases']
    plt.scatter(x, y, cmap = 'Spectral', edgecolor='k', alpha=1, c=df['deaths'])
    plt.xlabel('Date', fontsize=12, fontweight='bold')
    plt.ylabel('Cases', fontsize=12, fontweight='bold')
    plt.title(month, fontsize=14, fontweight='bold')
    plt.show()


# Getting covid data for the entire world 
world_data = pd.read_csv("covid-data.csv")
world_data.sample(5)


# Convert date to date format
import datetime as dt
world_data['date'] = [dt.datetime.strptime(x, '%Y-%m-%d') for x in world_data['date']]


import warnings
warnings.filterwarnings("ignore")
# Lets look at United States data
# We will create a subset from the database
countries = ['United States', 'India', 'Italy']
country_data = world_data[world_data.location.isin(countries)]

# Add a new column for the mortality rate
country_data['mortality_rate'] = country_data.apply(lambda row: row['total_deaths']/
                                                    row['total_cases'], axis=1).fillna(0)

# Make date the index
country_data.set_index('date', inplace=True)
country_data[['location', 'new_cases', 'total_cases', 
              'new_deaths', 'total_deaths', 'mortality_rate']].tail()


plt.style.use('seaborn')

# Graph cases and deaths in new graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,16))

# Plot each graph
country_data.groupby('location')['new_cases'].plot(ax=axes[0,0], legend=True)
country_data.groupby('location')['new_deaths'].plot(ax=axes[0,1], legend=True)
country_data.groupby('location')['total_cases'].plot(ax=axes[1,0], legend=True)
country_data.groupby('location')['total_deaths'].plot(ax=axes[1,1], legend=True)

# Set and label the axis for each plot
axes[0, 0].set_title('New Cases', fontweight='bold')
axes[0, 0].set_xlabel('Date', fontweight ='bold')
axes[0, 0].set_ylabel('Cases', fontweight ='bold')

axes[0, 1].set_title('New Deaths', fontweight='bold')
axes[0, 1].set_xlabel('Date', fontweight ='bold')
axes[0, 1].set_ylabel('Deaths', fontweight ='bold')

axes[1, 0].set_title('Total Cases', fontweight='bold')
axes[1, 0].set_xlabel('Date', fontweight ='bold')
axes[1, 0].set_ylabel('Cases in millions', fontweight ='bold')

axes[1, 1].set_title('Total Deaths', fontweight='bold')
axes[1, 1].set_xlabel('Date', fontweight ='bold')
axes[1, 1].set_ylabel('Deaths', fontweight ='bold')

Text(0, 0.5, 'Deaths')


world_copy = world_data.loc[~(world_data['location'].isin(['World']))]

# Group by location and date
world_copy = pd.DataFrame(world_copy.groupby(['location', 'date'])
                          ['location', 'total_cases', 'total_deaths'].sum()).reset_index()

world_copy = world_copy.sort_values(by = ['location', 'date'], ascending=False)
# Get rid of any duplicates in the dataset
world_copy = world_copy.drop_duplicates(subset = ['location'], keep='first')
# Lets do bar plots of top 10 countries
world_copy.reset_index(drop=True)

# Make a function to plot cases and death vs time
def make_plot(df, group, column, title, x_axis, y_axis):
    fig, ax = plt.subplots(1, 1, figsize=(20, 8))
    pal = sns.color_palette("tab10")
    df = df.sort_values([column], ascending=False).reset_index(drop=True)
    plot = sns.barplot(df[group][0:10], df[column][0:10], palette=pal)
    plot.set_title(("Plot for top 10 {} - ").format(title), fontweight='bold', fontsize=14)
    plot.set_xlabel(x_axis, fontweight='bold', fontsize=12)
    plot.set_ylabel(y_axis, fontweight='bold', fontsize=12)
    
    plt.show()
    
# Make bar plots for total cases and total deaths
make_plot(df=world_copy, group='location', column='total_cases', 
          title='countries with highest cases', 
          x_axis='Countries', y_axis='Total cases in millions')

make_plot(df=world_copy, group='location', column='total_deaths', 
          title='countries with highest deaths', 
          x_axis='Countries', y_axis='Total deaths')


import numpy as np
import scipy

# Function parameters:
# df - the dataframe
# title - the title of teh plot
# Delta: how many days we are chopping off for training
def exponential_data_model(df, title, delta):
    df = df.sort_values(by=['date'], ascending=True)
    df['x'] = np.arange(len(df)) + 1 # Column x in dataframe 
    df['y'] = df['total_cases'] # column y in the dataframe
    
    # Chop off data for validation and testing
    x = df['x'][:-delta] # Remove delta number of data points (so we can predict)
    y = df['y'][:-delta] 
    
    # Use exponential function to fit the curve
    fit_model = scipy.optimize.curve_fit(lambda t, a, b: a*np.exp(b*t), x, y, p0=(100000, 0.1))
    
    # Extract coefficients
    A, B = fit_model[0] # Coefficients
    print("Coefficients =====")
    print("value of A: ", A)
    print("value of B: ", B)
    print("=======================")
    
    x = range(1, df.shape[0] + 1)
    # Fit the model for predictions
    y_fit = A*np.exp(B*x)
    
    # Print the fit model
    fig, ax = plt.subplots(1, 1, figsize=(20, 8))
    g = sns.scatterplot(x=df['x'][:-delta], y=df['y'][:-delta],
                   label='Confirmed cases from the model creation', color='green')
    g = sns.scatterplot(x=df['x'][-delta:], y=df['y'][-delta:],
                   label='Confirmed cases for model validation', color='red')
    g = sns.lineplot(x=x, y=y_fit, label='Predicted Values', color='blue')
    
    # Our dataset includes 330 dates of data 
    # We will create our model using those 330 datapoints 
    # and predict the next 10 based on that model
    x_future = range(330, 340)
    y_future = A*np.exp(B*x_future)
    print("Expected cases for the next 10 days: \n", y_future)         
    plt.xlabel('Days since first case', fontweight='bold', fontsize=14)
    plt.ylabel('Total cases', fontweight='bold', fontsize=14)
    plt.title('Confirmed cases along with projected cases for ' + title, 
              fontweight='bold', fontsize=16)
    plt.xticks(rotation=90)
    ax.grid(color='yellow', linestyle='dotted', linewidth=0.75)
    plt.show()

# Use the world dataset to fit the model and print the plot
world_df = world_data.loc[~(world_data['location'].isin(['World']))]
world_df = pd.DataFrame(world_df.groupby(['location', 'date'])
                          ['total_cases', 'total_deaths'].sum()).reset_index()
world_df = world_df.sort_values(by = ['location', 'date'], ascending=False)

# Only include data for United States
us_data = world_df[world_df['location']=='United States']
us_df = us_data.copy()
exponential_data_model(us_df, "United States", 10)

Coefficients =====
value of A:  494786.69351850136
value of B:  0.010526662169295533
=======================
Expected cases for the next 10 days: 
 [15961346.95912357 16130254.12092161 16300948.70262724 16473449.61918694
 16647775.98571012 16823947.11958729 17001982.54263067 17181901.98323742
 17363725.37857573 17547472.87679419]


# Plotly
import matplotlib.ticker as ticker
import plotly.express as pex
from plotly.offline import plot


# New dataset will cases along with countries latitude and longitudes
confirmed_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
confirmed_deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
confirmed_recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

confirmed_cases.head()


# Rename some columns
confirmed_cases.rename(columns = {"Province/State": "State", 
                                 "Country/Region": "Country"}, inplace = True)
confirmed_deaths.rename(columns = {"Province/State": "State", 
                                 "Country/Region": "Country"}, inplace = True) 
confirmed_recovered.rename(columns = {"Province/State": "State", 
                                 "Country/Region": "Country"}, inplace = True) 

# Replace country that have Mainland China to just China
confirmed_cases['Country'].replace('Mainland China', 'China', inplace=True)
confirmed_deaths['Country'].replace('Mainland China', 'China', inplace=True)
confirmed_recovered['Country'].replace('Mainland China', 'China', inplace=True)

# Get rid of empty data
confirmed_cases[['State']] = confirmed_cases[['State']].fillna('')
confirmed_deaths[['State']] = confirmed_deaths[['State']].fillna('')
confirmed_recovered[['State']] = confirmed_recovered[['State']].fillna('')
confirmed_cases.fillna(0, inplace=True)
confirmed_deaths.fillna(0, inplace=True)
confirmed_recovered.fillna(0, inplace=True)

# Aggregate all the cases so we can see total sum for all countries
cases_count = confirmed_cases.iloc[:, 4:].sum().max()
deaths_count = confirmed_deaths.iloc[:, 4:].sum().max()
recovered_count = confirmed_recovered.iloc[:, 4:].sum().max()

# Print the info 
print('=============================')
print('Confirmed cases:', cases_count)
print('=============================')
print('Confirmed deaths:', deaths_count)
print('=============================')
print('Confirmed recovered:', recovered_count)
print('=============================')

# Make a dataframe consisting of the counts
counts_df = pd.DataFrame({
    'Total cases': [cases_count],
    'Total deaths': [deaths_count],
    'Total recovered': [recovered_count],
    'Active cases': [cases_count - deaths_count - recovered_count]
})

print("Dataframe with the counts-\n")
print(counts_df)

# We can convert all rows to columns in long format using melt
counts_long_df = counts_df.melt(value_vars=['Active cases', 'Total deaths', 'Total recovered'],
                               var_name='status', value_name='count')
counts_long_df['Upper'] = 'Total cases'
print('==============================================================')
print('Dataframe with counts as rows')
print(counts_long_df)

=============================
Confirmed cases: 80783674
=============================
Confirmed deaths: 1764863
=============================
Confirmed recovered: 47320022
=============================
Dataframe with the counts-

   Total cases  Total deaths  Total recovered  Active cases
0     80783674       1764863         47320022      31698789
==============================================================
Dataframe with counts as rows
            status     count        Upper
0     Active cases  31698789  Total cases
1     Total deaths   1764863  Total cases
2  Total recovered  47320022  Total cases


# Make a treemap for the cases using plotly express
fig = pex.treemap(counts_long_df, path=['status'], values='count',
                 color_discrete_sequence=['red', 'blue', 'green'],
                 template='plotly_dark')
fig.show()


from matplotlib.ticker import NullFormatter
from matplotlib.dates import MonthLocator, DateFormatter

# Sum all of the columns except the first four
confirmed_cases_count = confirmed_cases.iloc[:, 4:].sum(axis=0)
confirmed_deaths_count = confirmed_deaths.iloc[:, 4:].sum(axis=0)
confirmed_recovered_count = confirmed_recovered.iloc[:, 4:].sum(axis=0)
confirmed_active_count = confirmed_cases_count-confirmed_deaths_count-confirmed_recovered_count

# Using Seaborn to plot
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x=confirmed_cases_count.index, y=confirmed_cases_count, sort=False, linewidth=3)
sns.lineplot(x=confirmed_deaths_count.index, y=confirmed_deaths_count, sort=False, linewidth=3)
sns.lineplot(x=confirmed_recovered_count.index, y=confirmed_recovered_count, sort=False, linewidth=3)
sns.lineplot(x=confirmed_active_count.index, y=confirmed_active_count, sort=False, linewidth=3)

# Set xlabel, y label, and titles
plt.title("Covid Confirmed/Deaths/Recovered/Active plot worldwide", fontsize=16, fontweight='bold')
plt.xticks(rotation=90)
plt.xlabel('Date', fontsize=13, fontweight='bold')
plt.ylabel('Number of cases', fontsize=13, fontweight='bold')
ax.legend(['Confirmed Cases', 'Confirmed Deaths', 'Confirmed Recovered', 'Active cases'])

# Set the date format to show data by months
ax.xaxis.set_major_locator(MonthLocator())
ax.xaxis.set_minor_locator(MonthLocator(bymonthday=2))
ax.xaxis.set_major_formatter(NullFormatter())
ax.xaxis.set_minor_formatter(DateFormatter('%b'))

plt.show()


# World plot using latitude and longitude
confirmed_cases_agg = confirmed_cases.groupby('Country').sum().reset_index()
confirmed_cases_agg.loc[:, ['Lat', 'Long']] = confirmed_cases.groupby('Country').mean().reset_index().loc[:, ['Lat', 'Long']]

# Plot only countries with more than 50,000 cases
MIN_CASES = 50000
confirmed_cases_agg['totals'] = confirmed_cases.iloc[:, 4:].sum(axis=1)
confirmed_cases_agg[['Country', 'Lat', 'Long', 'totals']].head()

confirmed_cases_agg = confirmed_cases_agg[confirmed_cases_agg.iloc[:, 3:].max(axis=1) > MIN_CASES]

# Change the cases to long format
confirmed_cases_agg_long = pd.melt(confirmed_cases_agg,
                                   id_vars=confirmed_cases_agg.iloc[:, :3],
                                   var_name='date',
                                   value_vars=confirmed_cases_agg.iloc[:, 3:],
                                   value_name='confirmed_date_cases')

# Use plotly to visualize the cases
fig = pex.scatter_geo(confirmed_cases_agg_long,
                     lat="Lat", lon="Long", color="Country",
                     hover_name="Country", size="confirmed_date_cases",
                     size_max=50, animation_frame="date",
                     template='plotly_dark', projection="natural earth",
                     title="Cases over time")

fig.show()

	date	county	state	fips	cases	deaths
442509	2020-08-17	Washoe	Nevada	32031.0	6297	127.0
764733	2020-11-24	Washakie	Wyoming	56043.0	338	7.0
470814	2020-08-26	Carter	Kentucky	21043.0	113	3.0
432401	2020-08-14	Clay	Minnesota	27027.0	795	40.0
336811	2020-07-15	Creek	Oklahoma	40037.0	231	9.0
248738	2020-06-17	Buffalo	South Dakota	46017.0	55	0.0
155704	2020-05-18	Lafayette	Missouri	29107.0	68	1.0
136132	2020-05-11	Wyoming	West Virginia	54109.0	1	0.0
812527	2020-12-09	Guanica	Puerto Rico	72055.0	213	NaN
124556	2020-05-08	New Haven	Connecticut	9009.0	8887	669.0

	date	county	state	cases	deaths
620744	2020-10-11	LaMoure	North Dakota	124	0.0
276879	2020-06-26	Horry	South Carolina	2582	42.0
740780	2020-11-17	Golden Valley	North Dakota	139	0.0
12657	2020-03-26	Floyd	Georgia	17	1.0
428977	2020-08-13	Iberia	Louisiana	2588	74.0

	iso_code	continent	location	date	total_cases	new_cases	new_cases_smoothed	total_deaths	new_deaths	new_deaths_smoothed	...	gdp_per_capita	extreme_poverty	cardiovasc_death_rate	diabetes_prevalence	female_smokers	male_smokers	handwashing_facilities	hospital_beds_per_thousand	life_expectancy	human_development_index
25029	JPN	Asia	Japan	2020-06-30	18615.0	139.0	105.143	972.0	0.0	1.000	...	39002.223	NaN	79.370	5.72	11.2	33.7	NaN	13.05	84.63	0.909
47425	CHE	Europe	Switzerland	2020-03-05	114.0	24.0	15.143	1.0	NaN	0.000	...	57410.166	NaN	99.739	5.59	22.6	28.9	NaN	4.53	83.78	0.944
29330	LUX	Europe	Luxembourg	2020-08-27	7928.0	0.0	41.571	124.0	0.0	0.000	...	94277.965	0.2	128.275	4.42	20.9	26.0	NaN	4.51	82.25	0.904
43170	SYC	Africa	Seychelles	2020-03-22	7.0	0.0	0.714	NaN	NaN	0.000	...	26382.287	1.1	242.648	10.55	7.1	35.7	NaN	3.60	73.40	0.797
17591	FRA	Europe	France	2020-07-27	222508.0	2488.0	917.000	30214.0	18.0	4.571	...	38605.671	NaN	86.060	4.77	30.1	35.6	NaN	5.98	82.66	0.901

	location	new_cases	total_cases	new_deaths	total_deaths	mortality_rate
date
2020-12-13	United States	191142.0	16334361.0	1389.0	299293.0	0.018323
2020-12-14	United States	192846.0	16527207.0	1484.0	300777.0	0.018199
2020-12-15	United States	198766.0	16725973.0	2984.0	303761.0	0.018161
2020-12-16	United States	247403.0	16973376.0	3668.0	307429.0	0.018112
2020-12-17	United States	236211.0	17209587.0	3345.0	310774.0	0.018058

	Province/State	Country/Region	Lat	Long	...	12/18/20	12/19/20	12/20/20	12/21/20	12/22/20	12/23/20	12/24/20	12/25/20	12/26/20	12/27/20
0	NaN	Afghanistan	33.93911	67.709953	...	49621	49681	49817	50013	50190	50433	50655	50810	50886	51039
1	NaN	Albania	41.15330	20.168300	...	52004	52542	53003	53425	53814	54317	54827	55380	55755	56254
2	NaN	Algeria	28.03390	1.659600	...	94371	94781	95203	95659	96069	96549	97007	97441	97857	98249
3	NaN	Andorra	42.50630	1.521800	...	7519	7560	7577	7602	7633	7669	7699	7756	7806	7821
4	NaN	Angola	-11.20270	17.873900	...	16562	16626	16644	16686	16802	16931	17029	17099	17149	17240

Data Visualization and Analysis of COVID-19

Keshab Acharya

1. Introduction¶

2. Preparing the Data for U.S¶

3. Visualizing US data through plots¶

4. Data Analysis of top impacted countries¶

New cases plot¶

Total cases plot¶

New deaths plot¶

Total deaths plot¶

4.1 Data Analysis using barplots¶

4.2 Predicting future covid cases for US¶

Fit Results¶

Note¶

5. Data Analysis of the Whole World¶

5.1 TreeMap of the world¶

Analysis¶

5.2 Overall Plot of the World¶

5.3 Cases GeoMap of the world¶

6. Conclusion¶

Further Exploration¶