COVID-19 Dataset¶
This dataset is publicly available and hosted by Google in a very consumable format. More information about all the different statistics can be found here: https://health.google.com/covid-19/open-data
The dataset we will be looking at covers new cases, deaths, tests, and recoveries per day per country. Since the original dataset is massive, containing everything from search trends to ventilator patients and much more, we will only be looking at the daily counts of cases, deaths, tests, hospitalizations, and vaccinations over time.
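For reference, the per-location files follow a simple URL pattern; `US` is the key this notebook uses, and other location keys are assumed to follow the key scheme documented at the link above. A minimal sketch:

```python
import pandas as pd

# Per-location CSVs live under a predictable URL. "US" is the key used in
# this notebook; other keys are assumed to follow the dataset's documented
# location-key scheme.
def load_location(key: str) -> pd.DataFrame:
    url = f"https://storage.googleapis.com/covid19-open-data/v3/location/{key}.csv"
    return pd.read_csv(url)

us = load_location("US")
```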
Modeling¶
During the course of modeling the cases and deaths, we want to understand which models will work best, starting from simple LinearRegression and ending with a more complex model like ElasticNet.
The main goal is to understand if we can model new cases over time and predict the number of new cases in the future given the features.
import warnings
from sklearn.exceptions import ConvergenceWarning
# disable future, user, deprecation, runtime, and convergence warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
import os
import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
if os.path.exists("data.csv"):
    df = pd.read_csv("data.csv")
else:
    df = pd.read_csv("https://storage.googleapis.com/covid19-open-data/v3/location/US.csv")
    df.to_csv("data.csv", index=False)  # index=False avoids writing a stray index column
columns = [
    'date',
    'new_confirmed',
    'new_deceased',
    'new_tested',
    'new_hospitalized_patients',
    'new_intensive_care_patients',
    'new_ventilator_patients',
    'new_persons_vaccinated',
    'new_persons_fully_vaccinated'
]
df = df[columns]
df.date = pd.to_datetime(df.date)
# days since the first record, used as the numeric time feature
df['breakout'] = (df.date - df.date.min()).dt.days
# fill NaN values with the last valid observation, then 0 for any leading gaps
df = df.ffill().fillna(0)
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   date                          991 non-null    datetime64[ns]
 1   new_confirmed                 991 non-null    float64
 2   new_deceased                  991 non-null    float64
 3   new_tested                    991 non-null    float64
 4   new_hospitalized_patients     991 non-null    float64
 5   new_intensive_care_patients   991 non-null    float64
 6   new_ventilator_patients       991 non-null    float64
 7   new_persons_vaccinated        991 non-null    float64
 8   new_persons_fully_vaccinated  991 non-null    float64
 9   breakout                      991 non-null    int64
dtypes: datetime64[ns](1), float64(8), int64(1)
memory usage: 77.5 KB
|   | date | new_confirmed | new_deceased | new_tested | new_hospitalized_patients | new_intensive_care_patients | new_ventilator_patients | new_persons_vaccinated | new_persons_fully_vaccinated | breakout |
|---|------|---------------|--------------|------------|---------------------------|-----------------------------|-------------------------|------------------------|------------------------------|----------|
| 0 | 2020-01-01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
| 1 | 2020-01-02 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 |
| 2 | 2020-01-03 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2 |
| 3 | 2020-01-04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
| 4 | 2020-01-05 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4 |
# correlation matrix
plt.figure(figsize=(10, 10))
# numeric_only excludes the non-numeric date column from the matrix
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="YlGnBu")
plt.show()
sns.pairplot(df)
plt.show()
A couple of interesting correlations to speculate on: there is a significant positive correlation between hospitalizations and deaths, and positive correlations between hospitalizations and both confirmed cases and tests. Of course, correlation is not causation, but it is interesting to see these features move together.
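To make those claims concrete, here is a minimal sketch (reusing the `df` above) that ranks each feature by its correlation with `new_confirmed`:

```python
# rank the numeric features by their correlation with new_confirmed;
# numeric_only=True skips the datetime date column
corr_with_cases = (
    df.corr(numeric_only=True)['new_confirmed']
      .drop('new_confirmed')
      .sort_values(ascending=False)
)
print(corr_with_cases)
```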
Modeling!¶
We will start with the most basic modeling, and then move on to more complex models, throwing in polynomial features along the way if needed. We will be using the following models:
- LinearRegression
- Ridge
- Lasso
- ElasticNet
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
# features: everything except the target and the raw date column
x = df.drop(columns=['date', 'new_confirmed'])
y = df['new_confirmed']
# note: no random_state is set, so scores will vary from run to run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
r2_scores = {}
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
score = r2_score(y_test, y_pred)
print(f"Model R^2: {score}")
r2_scores['linear'] = score
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Model R^2: 0.6148192681133351
With simple LinearRegression, we typically find R^2 values between 0.5 and 0.7, depending on the random split. This gives us a baseline metric for comparing future models, and the best guess is that this model is underfitting.
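Since the split is unseeded, a single test-set R^2 is noisy; a minimal sketch using 5-fold cross-validation on the same `x` and `y` quantifies the spread:

```python
from sklearn.model_selection import cross_val_score

# cross-validation gives a distribution of R^2 scores rather than a
# single split-dependent number
cv_scores = cross_val_score(LinearRegression(), x, y, cv=5, scoring='r2')
print(f"R^2 per fold: {cv_scores}")
print(f"mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")
```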
model = Pipeline([
('poly', PolynomialFeatures()),
('model', LinearRegression())
])
params = {
'poly__degree': [1, 2, 3, 4, 5],
}
search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")
r2_scores['polylinear'] = best_score
y_pred = best_model.predict(x_test)
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Best Score: 0.8604017968416097 Best Params: {'poly__degree': 2}
Adding polynomial features, the gain is run-dependent: in many runs the grid search selects degree 1, eliminating polynomial terms entirely, perhaps because of the limited grid we searched over. In this run, a polynomial degree of 2 performed at a mean cross-validation R^2 of ~0.86. Note that `best_score_` is a cross-validation score on the training split, so it is not directly comparable to the earlier test-set R^2.
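To see how every degree fared, not just the winner, the per-candidate means can be read out of `cv_results_` (a sketch against the `search` object fitted above):

```python
# mean cross-validated R^2 for each polynomial degree in the grid
for degree, mean_score in zip(search.cv_results_['param_poly__degree'],
                              search.cv_results_['mean_test_score']):
    print(f"degree={degree}: mean CV R^2 = {mean_score:.3f}")
```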
model = Pipeline([
('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('model', Ridge())
])
params = {
'poly__degree': [1, 2, 3, 4, 5],
'model__alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]
}
search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")
r2_scores['polyridge'] = best_score
y_pred = best_model.predict(x_test)
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Best Score: 0.8878717762645195 Best Params: {'model__alpha': 0.1, 'poly__degree': 3}
With ridge regression, we see a further improvement: regularization certainly seemed to help, and some polynomial features were better than none. The best estimator had an R^2 value of ~0.89.
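To visualize what the penalty is doing, here is a sketch of the ridge coefficient paths as alpha grows, reusing the degree-3 polynomial expansion the search selected (the alpha range is an arbitrary choice for illustration):

```python
import numpy as np

# trace how the ridge coefficients shrink as alpha increases, using the
# same preprocessing as the winning pipeline (degree-3 poly + MinMax scaling)
poly = PolynomialFeatures(degree=3)
scaler = MinMaxScaler()
x_poly = scaler.fit_transform(poly.fit_transform(x_train))

alphas = np.logspace(-2, 3, 30)
coefs = [Ridge(alpha=a).fit(x_poly, y_train).coef_ for a in alphas]

plt.figure(figsize=(10, 4))
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('coefficient value')
plt.title('Ridge coefficient paths')
plt.show()
```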
model = Pipeline([
#('poly', PolynomialFeatures()),
# we will leave out polynomial features for now
# so we can see if any features are not important
('scaler', MinMaxScaler()),
('model', Lasso())
])
params = {
#'poly__degree': [1, 2, 3, 4, 5],
'model__alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]
}
search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")
print(f"Model Coefficients: {best_model['model'].coef_}")
r2_scores['lasso'] = best_score
y_pred = best_model.predict(x_test)
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Best Score: 0.6684711441421003 Best Params: {'model__alpha': 100} Model Coefficients: [ -0. 69726.29836806 549692.71633038 -235830.42772072 -29252.14225937 -94295.30754802 -24356.24698644 0. ]
Surprisingly, even at the largest alpha in the grid (100), Lasso zeroed out only two of the eight coefficients. With a best score of only ~0.67, we can start to suspect that this model is underfitting.
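A quick sketch pairs each coefficient with its feature name, so we can see exactly which features Lasso kept (this best estimator has no polynomial step, so the columns of `x` line up with the coefficients):

```python
# pair each coefficient with its feature name; zeros were eliminated by Lasso
for name, coef in zip(x.columns, best_model['model'].coef_):
    status = "zeroed" if coef == 0 else f"{coef:,.1f}"
    print(f"{name:30s} {status}")
```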
model = Pipeline([
('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('model', Lasso())
])
params = {
'poly__degree': [1, 2, 3, 4, 5],
'model__alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]
}
search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")
print(f"Model Coefficients: {len(best_model['model'].coef_)}")
print(f"Total Zeroed Coefficients: {len([coef for coef in best_model['model'].coef_ if coef == 0])}")
r2_scores['polylasso'] = best_score
y_pred = best_model.predict(x_test)
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Best Score: 0.8951442817507843 Best Params: {'model__alpha': 50, 'poly__degree': 4} Model Coefficients: 495 Total Zeroed Coefficients: 457
After selecting a polynomial of degree 4, the Lasso regression zeroed out nearly all of the 495 generated features anyway (457 of them). This is a good sign that some of the dataset's features may not be useful, or that the expanded model is simply too complex; with regularization we don't have to worry about that. The best score was ~0.89, the best of the models so far.
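A sketch that names the surviving terms, using `get_feature_names_out` on the fitted polynomial step (the top-10 cutoff is arbitrary):

```python
# recover the generated polynomial feature names and list the terms
# that Lasso left non-zero, largest magnitude first
feature_names = best_model['poly'].get_feature_names_out(x.columns)
coefs = best_model['model'].coef_
survivors = [(n, c) for n, c in zip(feature_names, coefs) if c != 0]
print(f"{len(survivors)} non-zero terms out of {len(coefs)}")
for name, coef in sorted(survivors, key=lambda t: abs(t[1]), reverse=True)[:10]:
    print(f"{name:60s} {coef:,.1f}")
```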
model = Pipeline([
('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('model', ElasticNet())
])
params = {
    'poly__degree': [1, 2, 3, 4, 5],
    'model__alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100],
    # l1_ratio mixes the L1 and L2 penalties and must lie in [0, 1]
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
}
search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")
print(f"Model Coefficients: {len(best_model['model'].coef_)}")
print(f"Total Zeroed Coefficients: {len([coef for coef in best_model['model'].coef_ if coef == 0])}")
r2_scores['polyelastic'] = best_score
y_pred = best_model.predict(x_test)
# plot predictions vs actual
plt.figure(figsize=(20, 5))
plt.scatter(x_test.breakout, y_pred)
plt.scatter(x_test.breakout, y_test)
plt.legend(labels=["Predicted", "Actual"])
plt.show()
Best Score: 0.8951442817507843 Best Params: {'model__alpha': 50, 'model__l1_ratio': 1, 'poly__degree': 4} Model Coefficients: 495 Total Zeroed Coefficients: 457
Finally, when applying ElasticNet, we see that the best score is again ~0.89, with almost exactly the same parameters as the Lasso model. That is no coincidence: with `l1_ratio` of 1 the penalty is pure L1, so ElasticNet reduces to Lasso. This may indicate we are dealing with too complex a model and could probably get away with a simpler one.
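The equivalence can be checked directly; a minimal sketch fitting both models on the same expanded features, with alpha and degree taken from this run's best params:

```python
import numpy as np

# with l1_ratio=1 the ElasticNet objective is exactly Lasso's, so the
# fitted coefficients should agree up to solver tolerance
poly = PolynomialFeatures(degree=4)
scaler = MinMaxScaler()
x_poly = scaler.fit_transform(poly.fit_transform(x_train))

lasso_coef = Lasso(alpha=50).fit(x_poly, y_train).coef_
enet_coef = ElasticNet(alpha=50, l1_ratio=1).fit(x_poly, y_train).coef_
print(np.allclose(lasso_coef, enet_coef, atol=1e-4))
```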
# plot r2 scores
plt.figure(figsize=(7, 3))
plt.bar(r2_scores.keys(), r2_scores.values())
print('best models: ', sorted(r2_scores.items(), key=lambda x: x[1], reverse=True)[:3])
best models: [('polylasso', 0.8951442817507843), ('polyelastic', 0.8951442817507843), ('polyridge', 0.8878717762645195)]
When we compare the best scores of each model, we see that the regularized models performed better than the non-regularized ones, and all the top models included some polynomial features. This probably indicates that some of the features are not linearly related to the target variable; but given how many of the expanded features were zeroed out, not all of them are useful. One caveat: the 'linear' entry is a test-set score while the others are cross-validation scores, so the comparison is only rough.
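For an apples-to-apples comparison, each winner can be refit with its selected hyperparameters and scored on the same held-out split; a sketch, with the hyperparameters hard-coded from this run's best params:

```python
# refit each tuned candidate and score all of them on the same test split
candidates = {
    'linear': LinearRegression(),
    'polyridge': Pipeline([('poly', PolynomialFeatures(degree=3)),
                           ('scaler', MinMaxScaler()),
                           ('model', Ridge(alpha=0.1))]),
    'polylasso': Pipeline([('poly', PolynomialFeatures(degree=4)),
                           ('scaler', MinMaxScaler()),
                           ('model', Lasso(alpha=50))]),
}
for name, est in candidates.items():
    est.fit(x_train, y_train)
    print(f"{name:10s} test R^2 = {r2_score(y_test, est.predict(x_test)):.3f}")
```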
Conclusion¶
With this analysis, our top models reached an R^2 of ~0.89. We chose the features that intuitively seemed likely to be correlated with new cases, and the pairplot shows heavy correlation between them. But the dataset itself contains many more features we could analyze; perhaps using search trends or location data could help, as the virus spread from location to location over time.
We may not even be looking at all the data we need. COVID-19 did come from overseas before it reached the US mainland, so looking at data from other countries could help us predict the spread of the virus in the US. Flight data may also be useful in this case.