What Causes Crashes? Analysis of U.S. Automobile Accident Data

Fall 2025 CMSC320 Project

Contributors

Name | Parts Completed | Contribution Summary
Thamer Alhussain | Parts A, B, C, F | Found and curated the dataset for the project, completed parts of EDA, wrote the conclusion and insights, and helped with formatting and adding contributors & headers for the final tutorial.
Gavin Harmon | Parts B, C, D, E | Completed data pre-processing, parts of EDA, the machine learning and model selection sections, and pushed all members' contributions to GitHub.
Sohan Challa | Parts A, D, E | Wrote the initial problem statement and completed the main statistical analysis section of the final tutorial.
Kevin Qi | Parts A, C, G | Wrote the introduction, data curation, and pre-processing sections, parts of EDA, and conclusions for the EDA section.
Yash Aggarwal | Parts B, F, G | Created the GitHub repository and produced visualizations for most sections of the final tutorial.

Introduction

Automobiles are the single most common method of personal transportation and travel in the United States. As such, car accidents are one of the most important public safety concerns throughout the country, as they impact nearly all Americans – drivers, passengers, and pedestrians alike. Each of the millions of accidents that occur each year has the potential to result in substantial human, economic, environmental, and social costs. Understanding why accidents occur, and the risk factors behind severe and fatal accidents, is essential for designing safer road infrastructure, drafting more effective transportation policy, and allocating emergency response resources more efficiently.

Analysis of the patterns behind traffic accidents is made possible by the existence of large-scale, comprehensive, nationwide traffic datasets for the United States. The size and scope of these datasets allow for the exploration of the effects of environmental conditions and road context on crash outcomes, as well as cross-regional and cross-state comparisons. In this project tutorial, we will investigate trends in the U.S. Accidents Dataset (2016-2023), created by Sobhan Moosavi and his research partners. This is one of the largest publicly available transportation safety datasets for the U.S., with nearly eight million records collected from a number of transportation and law-enforcement agencies, and it will enable us to answer some important guiding questions.

First, we would like to explore which environmental or temporal factors contribute most to severe automobile accidents. A number of authoritative sources on road safety – from the National Highway Traffic Safety Administration to state-level Departments of Transportation – suggest taking extra precautions when driving in bad weather or at night. However, does reduced visibility actually correlate with more severe accidents? Are night-time crashes more likely to be severe? And which of these two factors raises the risk of a severe accident more?

Another important line of inquiry is cross-regional comparison. Road infrastructure, safety standards and regulations, speed limits, and consumer choices (e.g., buying larger personal automobiles) all vary across U.S. states. It therefore seems important to ask whether some states experience disproportionately more severe road accidents. For example, are crashes in Texas, with its higher speed limits and higher proportion of large vehicles, more severe than crashes in other states like California?

We can also explore how machine learning can help us in identifying risk factors. Could machine learning help us uncover hidden patterns or build predictive models for automobile accident severity?

Each of these inquiries represents an important topic of interest for many stakeholders across the U.S. – from policy-makers and planners to insurance companies and drivers. By putting this dataset through the full data science pipeline, we aim to find meaningful insights and patterns that can improve our understanding of automobile accident severity and road safety in the United States.

The data science pipeline for our project will be defined as such:

  1. Introduction & Inquiries (as seen above)
  2. Data Curation
  3. Data Cleaning & Pre-Processing
  4. Exploratory Data Analysis
  5. Data Analysis & Machine Learning
  6. Interpretation, Insights, & Conclusions

In order to follow along with this project, the following packages are required:

  1. pandas
  2. numpy
  3. scipy
  4. kagglehub
  5. matplotlib.pyplot
  6. matplotlib.colors
  7. statsmodels.stats.proportion
  8. statsmodels.formula.api
  9. statsmodels.api
  10. sklearn.model_selection
  11. sklearn.linear_model
  12. sklearn.ensemble
  13. sklearn.metrics
  14. sklearn.preprocessing
  15. warnings

Data Curation

As stated in the introduction, our project uses the U.S. Accidents (2016-2023) dataset published by Sobhan Moosavi and his research partners, and made available on Kaggle at the following location: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents. This data was made possible by the following papers:

Moosavi, Sobhan, Mohammad Hossein Samavatian, Srinivasan Parthasarathy, and Rajiv Ramnath. "A Countrywide Traffic Accident Dataset." 2019. https://arxiv.org/abs/1906.05409.

Moosavi, Sobhan, Mohammad Hossein Samavatian, Srinivasan Parthasarathy, Radu Teodorescu, and Rajiv Ramnath. "Accident Risk Prediction based on Heterogeneous Sparse Data: New Dataset and Insights." In Proceedings of the 27th ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems, ACM, 2019. https://arxiv.org/abs/1909.09638.

The dataset aggregates accident information from 49 U.S. states using multiple reporting and recording systems, including the U.S. DOT, various state DOTs, traffic footage, and emergency and law enforcement response logs. It includes over 7.7 million traffic accident records and 44 attributes for each row (excluding ID and Source columns), which describe various environmental conditions, temporal information, geographic information, traffic and road features, a severity index for accidents (ranging from 1-4), along with other characteristics.

The large size of the dataset required us to use the kagglehub API/Python package to automatically download the .csv file and load it into a usable pandas dataframe, as seen in the data preprocessing section below.

Data Pre-Processing & Cleaning

After curating and loading the dataset, we began the next step of the data science pipeline: data pre-processing and cleaning. We observed some variation in the completeness of data across columns, so based on our experience and the best practices taught in CMSC320, we did the following (as can be seen in the code below this section):

  1. We dropped columns where more than 50% of the data was missing. Discarding any columns that met this criterion early on prevents sparsely populated and/or unreliable features from distorting our main analysis.

  2. Engineered easily usable datetime features. The "Start_Time" variable in the original dataset was converted into a Python datetime object, from which we extracted variables for year, month, hour, and weekday. These new features will allow us to better use the datetime information from the dataset in our models, or to conduct temporal trend analysis if desired.

  3. Imputed missing values for columns that were not missing too much data, using median imputation for numeric columns and mode imputation for categorical columns. Median imputation was chosen for its robustness to skewed distributions. This step ensures that we are not passing missing values to any ML algorithms or statistical tests we use (which could have caused errors or invalid results); we recorded no missing values after this step was completed.

After our pre-processing, our dataset consisted of 7,728,394 rows and 48 cleaned features (plus 2 columns containing ID and data-source information). Without losing too much data in the process, we were able to reduce noise and unreliable features, engineer variables that allow further analysis of date/time/seasonal patterns, and ensure our data was compatible with Python's many statistical analysis and machine learning packages. Completing the pre-processing step gave us a clean dataframe and a solid foundation for the next step of the data science pipeline: exploratory data analysis.

In [1]:
#data cleaning, importing, etc
#note: this dataset csv is close to 3gb in size. it may take well over a minute to load.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib.colors import LogNorm
pd.set_option('display.float_format', '{:.2f}'.format)

df = pd.DataFrame()
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    print("kagglehub not found. Run pip install kagglehub in your terminal")
try:
    print("Downloading dataset from Kaggle...")
    folder = kagglehub.dataset_download("sobhanmoosavi/us-accidents")
    df = pd.read_csv(folder + "/US_Accidents_March23.csv", encoding="latin1", low_memory=False)
    print("Dataset loaded successfully.")
except Exception as e:
    print("KaggleHub download failed:", e)
    
    
# first, drop columns with >50% missing data (thresh must be an integer)

df = df.dropna(axis=1, thresh=len(df) // 2)

# next datetime stuff

df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')

df['Year'] = df['Start_Time'].dt.year
df['Month'] = df['Start_Time'].dt.month
df['Hour'] = df['Start_Time'].dt.hour
df['Weekday'] = df['Start_Time'].dt.day_name()


rows_with_nans = df.isnull().any(axis=1).sum()
print("Rows with missing values before cleaning:", rows_with_nans)

# next, fill missing numeric values with the column median
count = 0
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
        count += 1

# then for NaNs in categorical items just do mode
categorical_cols = df.select_dtypes(exclude=np.number).columns
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])
        count += 1



print("Total rows:", len(df))
print("Total columns:", len(df.columns))
print("Any NaNs left?", df.isnull().sum().sum() > 0)   

 
Downloading dataset from Kaggle...
Dataset loaded successfully.
Rows with missing values before cleaning: 4864129
Total rows: 7728394
Total columns: 50
Any NaNs left? False

Exploratory Data Analysis

In [2]:
print("BASIC STRUCTURE")
print(f"Shape: {df.shape}")
print(f"Number of features: {df.shape[1]}")
print(f"Number of entries: {df.shape[0]}")
print("Columns:", list(df.columns))
print("\n")

print("UNIQUE VALUE COUNTS (lowest 15)")
print(df.nunique().sort_values().head(15))
print("\n")

print("TOP STATES BY ACCIDENT COUNT")
print(df['State'].value_counts().head(10))
print("\n")

print("DISTRIBUTION OF SEVERITY")
print(df['Severity'].value_counts(normalize=True).sort_index() * 100)
plt.figure(figsize=(7,4))
df['Severity'].value_counts().sort_index().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Accident Severity Distribution')
plt.xlabel('Severity Level')
plt.ylabel('Count')
plt.show()

print("Accidents per Year:")
print(df['Year'].value_counts().sort_index())
plt.figure(figsize=(7,4))
df['Year'].value_counts().sort_index().plot(marker='o')
plt.title('Accidents per Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(alpha=0.3)
plt.show()
BASIC STRUCTURE
Shape: (7728394, 50)
Number of features: 50
Number of entries: 7728394
Columns: ['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'Year', 'Month', 'Hour', 'Weekday']


UNIQUE VALUE COUNTS (lowest 15)
Country           1
Turning_Loop      1
Amenity           2
Bump              2
Civil_Twilight    2
Sunrise_Sunset    2
Traffic_Signal    2
Crossing          2
Railway           2
Stop              2
Station           2
Roundabout        2
Junction          2
No_Exit           2
Give_Way          2
dtype: int64


TOP STATES BY ACCIDENT COUNT
State
CA    1741433
FL     880192
TX     582837
SC     382557
NY     347960
NC     338199
VA     303301
PA     296620
MN     192084
OR     179660
Name: count, dtype: int64


DISTRIBUTION OF SEVERITY
Severity
1    0.87
2   79.67
3   16.81
4    2.65
Name: proportion, dtype: float64
[Figure: Accident Severity Distribution (bar chart of accident counts by severity level)]
Accidents per Year:
Year
2016.00     410821
2017.00     717290
2018.00     893426
2019.00     954302
2020.00    1904764
2021.00    1412433
2022.00    1268806
2023.00     166552
Name: count, dtype: int64
[Figure: Accidents per Year (line plot, 2016-2023)]
In [3]:
# Chi Squared Test
# Severity vs Day/Night
# Is the severity of accident independent of daylight?


df['Severe'] = (df['Severity'] >= 3).astype(int) # note the dataset ranks severity 1-4

table = pd.crosstab(df['Severe'], df['Sunrise_Sunset'])
chi2, p, dof, expected = stats.chi2_contingency(table)

print("Contingency table:")
print(table)
print(f"\nChi Squared = {chi2: .3f}, p = {p: .4e}, dof = {dof}")
print("Expected frequencies:")
print(pd.DataFrame(expected, index=table.index, columns=table.columns))

proportions = (table.T / table.T.sum()).T

proportions.plot(
    kind='bar',
    stacked=True,
    figsize=(7,4),
    color=['skyblue', 'lightcoral'],
    edgecolor='black'
)

plt.title('Proportion of Severe vs Non-Severe Accidents by Day/Night')
plt.xlabel('Severity (0 = Non-Severe, 1 = Severe)')
plt.ylabel('Proportion')
plt.legend(title='Sunrise_Sunset')
plt.grid(alpha=0.3)
plt.show()
Contingency table:
Sunrise_Sunset      Day    Night
Severe                          
0               4302278  1922069
1               1055521   448526

Chi Squared =  638.277, p =  7.9177e-141, dof = 1
Expected frequencies:
Sunrise_Sunset        Day      Night
Severe                              
0              4315100.93 1909246.07
1              1042698.07  461348.93
[Figure: Proportion of Severe vs Non-Severe Accidents by Day/Night (stacked bar chart)]

The test produced a chi-squared statistic of 638.28 with 1 degree of freedom and p ≈ 7.9 × 10^-141.

Since p < 0.05, we reject the null hypothesis that accident severity is independent of daylight conditions. There is a statistically significant relationship between time of day and accident severity.

Although both daytime and nighttime have large numbers of accidents, the proportion of severe accidents is slightly higher during the day than it would be if evenly distributed.

This could be a reflection of the much greater volume of daytime traffic. More vehicles on the road increases the probability of high-impact collisions even though (generally) driving conditions are better. With more cars on the road, there are more things to hit.

The stacked bar chart shows the proportions of severe vs non-severe accidents by daylight condition. While the difference in proportions is relatively small, the dataset's size of over 7 million collisions makes this pattern statistically meaningful.
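With nearly eight million rows, almost any association will clear p < 0.05, so it can help to pair the chi-squared statistic with an effect size. The sketch below (a supplementary addition, not part of the original analysis) computes Cramér's V from the contingency-table counts printed above:

```python
import numpy as np
from scipy import stats

# Contingency table from the test above: rows = Severe (0/1), cols = Day/Night
table = np.array([[4302278, 1922069],
                  [1055521,  448526]])

chi2, p, dof, expected = stats.chi2_contingency(table)
n = table.sum()

# Cramér's V: effect size for a chi-squared test of independence,
# ranging from 0 (no association) to 1 (perfect association)
cramers_v = np.sqrt(chi2 / (n * (min(table.shape) - 1)))
print(f"chi2 = {chi2:.2f}, Cramér's V = {cramers_v:.4f}")
```

A V on the order of 0.009 indicates the day/night association, while real, is practically tiny, consistent with the interpretation above.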

In [4]:
# Correlation analysis between visibility and accident severity

from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
corr_df = df[['Visibility(mi)', 'Severity']].dropna().copy()
corr_df['Severity'] = pd.to_numeric(corr_df['Severity'], errors='coerce')
corr_df = corr_df.dropna()

lo, hi = corr_df['Visibility(mi)'].quantile([0.01, 0.99])
trim = corr_df[(corr_df['Visibility(mi)'] >= lo) & (corr_df['Visibility(mi)'] <= hi)]
removed = len(corr_df) - len(trim)
print(f"Trimmed visibility outside [{lo:.1f}, {hi:.1f}] mi — removed {removed:,} rows.")

pearson_r, pearson_p = pearsonr(trim['Visibility(mi)'], trim['Severity'])
spearman_r, spearman_p = spearmanr(trim['Visibility(mi)'], trim['Severity'])

print(f"Pearson correlation:  r = {pearson_r:.4f}, p-value = {pearson_p:.3e}")
print(f"Spearman correlation: r = {spearman_r:.4f}, p-value = {spearman_p:.3e}")

plt.figure(figsize=(7,4))
hb = plt.hexbin(trim['Visibility(mi)'], trim['Severity'], gridsize=60, cmap='Blues', norm=LogNorm(), mincnt=1)
plt.colorbar(label='Number of Accidents (log scale)')
plt.xlabel('Visibility (miles)')
plt.ylabel('Severity (1–4)')
plt.title('Visibility vs. Accident Severity')

x = trim['Visibility(mi)'].to_numpy()
y = trim['Severity'].to_numpy()
b1, b0 = np.polyfit(x, y, 1)
xline = np.linspace(x.min(), x.max(), 100)
plt.plot(xline, b1 * xline + b0, color='red', linewidth=2)
plt.show()
Trimmed visibility outside [0.5, 10.0] mi — removed 81,838 rows.
Pearson correlation:  r = -0.0136, p-value = 5.594e-311
Spearman correlation: r = -0.0152, p-value = 0.000e+00
[Figure: Visibility vs. Accident Severity (hexbin plot with fitted trend line)]
In [5]:
# hypothesis test of severity of accidents between California and Texas
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt
import numpy as np

def compare_state_severity(df, state_a='CA', state_b='TX'):
    df['Severe'] = (df['Severity'] >= 3).astype(int)

    # severe accidents
    counts = np.array([
        df.query("State == @state_a")['Severe'].sum(),
        df.query("State == @state_b")['Severe'].sum()
    ])
    
    # total accidents
    nobs = np.array([
        len(df.query("State == @state_a")),
        len(df.query("State == @state_b"))
    ])

    p1, p2 = counts[0]/nobs[0], counts[1]/nobs[1]
    z, p = proportions_ztest(counts, nobs)

    print(f"{state_a} vs {state_b}")
    print(f"{state_a}: {counts[0]:,}/{nobs[0]:,} = {p1*100:.2f}% severe")
    print(f"{state_b}: {counts[1]:,}/{nobs[1]:,} = {p2*100:.2f}% severe")
    print(f"Difference: {(p1 - p2)*100:.2f}%")
    print(f"z = {z:.2f}, p = {p:.3e}")



    plt.figure(figsize=(6,4), dpi=120)
    bars = plt.bar([state_a, state_b], [p1, p2],
                   color=['skyblue','salmon'], edgecolor='black', width=0.55)

    for i, v in enumerate([p1, p2]):
        plt.text(i, v + 0.003, f"{v*100:.2f}%", ha='center', va='bottom',
                 fontsize=11, fontweight='bold')

    plt.ylim(0, max(p1,p2)+0.03)
    plt.ylabel("Proportion of Severe Accidents", fontsize=11)
    plt.title(f"Severe Accident Rates: {state_a} vs {state_b}", fontsize=13, weight='bold')
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.box(False)
    plt.show()

compare_state_severity(df, 'CA', 'TX')
CA vs TX
CA: 285,316/1,741,433 = 16.38% severe
TX: 127,652/582,837 = 21.90% severe
Difference: -5.52%
z = -95.39, p = 0.000e+00
[Figure: Severe Accident Rates: CA vs TX (bar chart of severe-accident proportions)]

To test whether the proportion of severe accidents (severity >= 3) differs between California and Texas, a two-proportion z-test was performed.

The p-value is far below .05 so we reject the null hypothesis that both states have equal rates of severe accidents. Statistically, Texas has a significantly higher proportion of severe accidents than California.
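Beyond significance, a confidence interval conveys how large the gap between the two states actually is. This sketch (our addition, reusing the counts reported in the z-test output above) computes a 95% interval for the difference in severe-accident proportions with statsmodels:

```python
from statsmodels.stats.proportion import confint_proportions_2indep

# Severe counts and totals from the z-test output above (CA vs TX)
count_ca, n_ca = 285_316, 1_741_433
count_tx, n_tx = 127_652, 582_837

# 95% CI for the difference in proportions (CA minus TX)
low, high = confint_proportions_2indep(count_ca, n_ca, count_tx, n_tx)
print(f"95% CI for p_CA - p_TX: [{low*100:.2f}%, {high*100:.2f}%]")
```

The entire interval sits near -5.5 percentage points, so the sample sizes are large enough that the estimated gap is very precise.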

This difference in proportions between states could be due to a number of factors. Speed and vehicle size both play a part in the severity of vehicle accidents, and Texas 'outdoes' California in both categories. Texan roads and highways generally have higher standard speed limits, up to 85 miles per hour on highways and up to 75 miles per hour on two-lane undivided roadways (a type of road particularly prone to severe accidents). In contrast, Californian highways usually top out at around 65 miles per hour, with a maximum of around 55 miles per hour on undivided two-lane roads. In terms of vehicle size, Texan roads are generally home to a higher number of pickups and SUVs, whose larger sizes increase the risk of injury to the occupants of the vehicles they crash with, as per the IIHS. In Texan cities like Houston, SUVs and light trucks outsold regular cars roughly four to one, while in California these larger vehicles outsold regular cars at only around twice the rate.

We can use some of the variables in our dataset to investigate road type as a factor in these accidents: we can search the accident description variable for indicators of highway involvement (e.g., I-##, Hwy-##, Highway ##), and use variables like 'Crossing', 'Stop', and 'Junction' to gain further insight into whether an accident occurred on regular roads rather than highways. However, as the variables in our dataset mostly describe the environment and circumstances of the accidents, and not the vehicles themselves, it may be difficult to assess the role of vehicle speed in accident severity, or to investigate the second possible explanation (regarding vehicle size).
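The description-based approach could be sketched as a regular-expression flag over the Description column. The pattern and the sample rows below are illustrative assumptions, not part of the project code:

```python
import pandas as pd

# Hypothetical pattern for common U.S. highway naming conventions
highway_pattern = r'\b(?:I-\d+|US-\d+|Hwy\s*\d+|Highway\s*\d+|Freeway|Interstate)\b'

# Toy sample rows standing in for the real Description column
sample = pd.DataFrame({'Description': [
    'Accident on I-35 northbound at Exit 4.',
    'Stalled vehicle on Main St at the junction.',
    'Lane blocked on Highway 101 near Gilroy.',
]})

# Flag rows whose free text suggests highway involvement
sample['On_Highway'] = sample['Description'].str.contains(
    highway_pattern, case=False, regex=True)
print(sample)
```

In practice the pattern would need tuning against real descriptions (state routes, abbreviations like "Fwy", etc.), but it shows how a binary highway indicator could be derived for follow-up analysis.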

Sources:

  1. https://www.hsinjurylaw.com/library/the-dangers-of-two-lane-highways.cfm
  2. https://www.iihs.org/research-areas/speed/speed-limit-laws
  3. https://www.iihs.org/news/detail/supersizing-vehicles-offers-minimal-safety-benefits--but-substantial-dangers
  4. https://www.cncda.org/wp-content/uploads/Cal-Covering-2Q-23_FINAL.pdf
  5. https://www.houston.org/houston-data/monthly-update-vehicle-sales/

Main Data Analysis

In [41]:
print("="*80)
print("COMPARISON: DAY/NIGHT vs. GOOD/BAD VISIBILITY")
print("="*80)

# New binary variables
df['Is_Night'] = (df['Sunrise_Sunset'] == 'Night').astype(int)
df['Bad_Visibility'] = (df['Visibility(mi)'] < 3).astype(int)

print("\n- Is_Night: 0 = Day, 1 = Night")
print("- Good Visibility: 0 = Visibility >= 3 mi, Bad_Visibility: 1 = Visibility < 3 mi")

print(f"\nGroup Sample Sizes (Number of Accidents):")
print(f"- Day: {(df['Is_Night'] == 0).sum()}")
print(f"- Night: {(df['Is_Night'] == 1).sum()}")
print(f"- Good visibility: {(df['Bad_Visibility'] == 0).sum()}")
print(f"- Bad visibility: {(df['Bad_Visibility'] == 1).sum()}")
================================================================================
COMPARISON: DAY/NIGHT vs. GOOD/BAD VISIBILITY
================================================================================

- Is_Night: 0 = Day, 1 = Night
- Good Visibility: 0 = Visibility >= 3 mi, Bad_Visibility: 1 = Visibility < 3 mi

Group Sample Sizes (Number of Accidents):
- Day: 5357799
- Night: 2370595
- Good visibility: 7341776
- Bad visibility: 386618
In [42]:
print("="*80)
print('MEAN SEVERITY COMPARISON')
print("="*80)

nighttime_mean = df[df['Is_Night'] == 1]['Severity'].mean()
daytime_mean = df[df['Is_Night'] == 0]['Severity'].mean()
bad_vis_mean = df[df['Bad_Visibility'] == 1]['Severity'].mean()
good_vis_mean = df[df['Bad_Visibility'] == 0]['Severity'].mean()

print("\nMeans for Each Group:")
print(f"- Day: {daytime_mean:.3f}")
print(f"- Night: {nighttime_mean:.3f}")
print(f"- Good Visibility: {good_vis_mean:.3f}")
print(f"- Bad Visibility: {bad_vis_mean:.3f}")

day_night_diff = abs(nighttime_mean - daytime_mean)
visibility_diff = abs(bad_vis_mean - good_vis_mean)

print("\nMean Differences:")
print(f"Day and Night: {day_night_diff:.3f}")
print(f"Good and Bad Vis.: {visibility_diff:.3f}")

print("\nWe see a slightly larger difference with visibility.")
================================================================================
MEAN SEVERITY COMPARISON
================================================================================

Means for Each Group:
- Day: 2.209
- Night: 2.219
- Good Visibility: 2.212
- Bad Visibility: 2.225

Mean Differences:
Day and Night: 0.010
Good and Bad Vis.: 0.013

We see a slightly larger difference with visibility.
In [43]:
print("="*80)
print("T-TEST COMPARISON")
print("="*80)

t_day_night, p_day_night = stats.ttest_ind(
    df[df['Is_Night'] == 0]['Severity'],
    df[df['Is_Night'] == 1]['Severity'],
    equal_var=False
)

t_vis, p_vis = stats.ttest_ind(
    df[df['Bad_Visibility'] == 0]['Severity'],
    df[df['Bad_Visibility'] == 1]['Severity'],
    equal_var=False
)

print("\nRESULTS:")
print(f"Day/Night: t = {t_day_night:.3f}, p = {p_day_night:.2f}")
print(f"Good/Bad Vis.: t = {t_vis:.3f}, p = {p_vis:.2f}")

print("\nWe see that day/night has the larger absolute t-value.")
================================================================================
T-TEST COMPARISON
================================================================================

RESULTS:
Day/Night: t = -24.954, p = 0.00
Good/Bad Vis.: t = -16.529, p = 0.00

We see that day/night has the larger absolute t-value.
In [44]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

print("="*80)
print("TWO-WAY ANOVA COMPARISON:")
print("="*80)

model = ols("Severity ~ C(Is_Night) + C(Bad_Visibility)", data=df).fit()
table = sm.stats.anova_lm(model, typ=2)

print("ANOVA Results:")
print(table[['sum_sq', 'F']])
================================================================================
TWO-WAY ANOVA COMPARISON:
================================================================================
ANOVA Results:
                      sum_sq      F
C(Is_Night)           148.45 624.64
C(Bad_Visibility)      61.95 260.67
Residual          1836722.27    NaN

From these tests, specifically the t-test and two-way ANOVA, we see that day/night has a stronger effect on accident severity than visibility. We observe this from the higher absolute t-value as well as a greater F-statistic found through ANOVA. Additionally, we find extremely low p-values from the t-test that are less than 0.05, meaning that these findings are statistically significant.

Furthermore, the negative day/night t-value indicates that more severe accidents occur during the day. This aligns with what we previously found, and is potentially due to the higher traffic present in daytime. Again, the severity measured within this dataset is strictly based on the impact it has on traffic.
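The F-statistics alone do not convey practical magnitude, so as a supplementary check we can compute eta-squared (the share of total variance explained by each factor) directly from the sums of squares printed in the ANOVA table above:

```python
# Sums of squares copied from the ANOVA output above
ss_night, ss_vis, ss_resid = 148.45, 61.95, 1_836_722.27
ss_total = ss_night + ss_vis + ss_resid

# eta^2 = SS_effect / SS_total for each factor
eta_night = ss_night / ss_total
eta_vis = ss_vis / ss_total
print(f"eta^2 (day/night):  {eta_night:.6f}")
print(f"eta^2 (visibility): {eta_vis:.6f}")
```

Both values are well below 0.001, so while day/night explains more variance than visibility, each factor individually accounts for only a minuscule fraction of the variation in severity.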

Main Analysis: Machine Learning and Model Selection

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


print("="*80)
print("MACHINE LEARNING: PREDICTING ACCIDENT SEVERITY")
print("="*80)
print("\nResearch Questions:")
print("1. Can we predict accident severity from environmental/temporal/road conditions?")
print("2. Which features are most important for predicting severe accidents?")
print("3. How do different ML models compare in performance?")
print("="*80)



ml_df = df.copy()

if 'Severe' not in ml_df.columns:
    ml_df['Severe'] = (ml_df['Severity'] >= 3).astype(int)

print(f"\nOriginal dataset shape: {ml_df.shape}")
print(f"Severe accident distribution:\n{ml_df['Severe'].value_counts()}")
print(f"Severe accident rate: {ml_df['Severe'].mean()*100:.2f}%")

feature_columns = [
    # weather / environmental conditions
    'Temperature(F)',
    'Visibility(mi)',
    'Precipitation(in)',

    # temporal features
    'Hour',
    'Month',

    # road features
    'Junction',
    'Crossing',
    'Stop',
    'Traffic_Signal',
]


categorical_features = ['Weather_Category', 'Time_Category']

available_features = [f for f in feature_columns if f in ml_df.columns]
available_categorical = [f for f in categorical_features if f in ml_df.columns]

print(f"\nAvailable numeric features: {len(available_features)}")
print(available_features)
print(f"\nAvailable categorical features: {len(available_categorical)}")
print(available_categorical)


X = ml_df[available_features].copy()


for col in X.columns:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())

print(f"\nNumeric features shape: {X.shape}")
print(f"Any NaNs in numeric features? {X.isnull().sum().sum()}")

for cat_feature in available_categorical:
    dummies = pd.get_dummies(ml_df[cat_feature], prefix=cat_feature, drop_first=True)
    X = pd.concat([X, dummies], axis=1)

print(f"\nFinal feature matrix shape after encoding: {X.shape}")
print(f"Total features: {X.shape[1]}")

y = ml_df['Severe'].values

print(f"\nTarget variable shape: {y.shape}")
print(f"Class distribution: {np.bincount(y)}")

feature_names = list(X.columns)
print(f"\nAll feature names ({len(feature_names)} total):")
for i, name in enumerate(feature_names, 1):
    print(f"  {i}. {name}")



X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y 
)

print(f"\nTraining set size: {X_train.shape[0]:,} samples")
print(f"Testing set size: {X_test.shape[0]:,} samples")
print(f"Number of features: {X_train.shape[1]}")

print(f"\nTraining set class distribution:")
print(f"  Non-severe: {np.sum(y_train == 0):,} ({np.sum(y_train == 0)/len(y_train)*100:.1f}%)")
print(f"  Severe: {np.sum(y_train == 1):,} ({np.sum(y_train == 1)/len(y_train)*100:.1f}%)")

print(f"\nTesting set class distribution:")
print(f"  Non-severe: {np.sum(y_test == 0):,} ({np.sum(y_test == 0)/len(y_test)*100:.1f}%)")
print(f"  Severe: {np.sum(y_test == 1):,} ({np.sum(y_test == 1)/len(y_test)*100:.1f}%)")


print("\n" + "="*80)
print("LOGISTIC REGRESSION (BASELINE)")
print("="*80)


lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

print("\n" + "="*60)
print("LOGISTIC REGRESSION RESULTS")
print("="*60)
print(f"Accuracy:  {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1-Score:  {lr_f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, 
                          target_names=['Non-Severe', 'Severe'],
                          digits=4))

cm_lr = confusion_matrix(y_test, y_pred_lr)
print("\nConfusion Matrix:")
print(f"                 Predicted")
print(f"                 Non-Severe    Severe")
print(f"Actual Non-Severe  {cm_lr[0,0]:>8,}  {cm_lr[0,1]:>8,}")
print(f"       Severe      {cm_lr[1,0]:>8,}  {cm_lr[1,1]:>8,}")
================================================================================
MACHINE LEARNING: PREDICTING ACCIDENT SEVERITY
================================================================================

Research Questions:
1. Can we predict accident severity from environmental/temporal/road conditions?
2. Which features are most important for predicting severe accidents?
3. How do different ML models compare in performance?
================================================================================

Original dataset shape: (7728394, 52)
Severe accident distribution:
Severe
0    6224347
1    1504047
Name: count, dtype: int64
Severe accident rate: 19.46%

Available numeric features: 9
['Temperature(F)', 'Visibility(mi)', 'Precipitation(in)', 'Hour', 'Month', 'Junction', 'Crossing', 'Stop', 'Traffic_Signal']

Available categorical features: 1
['Weather_Category']

Numeric features shape: (7728394, 9)
Any NaNs in numeric features? 0

Final feature matrix shape after encoding: (7728394, 16)
Total features: 16

Target variable shape: (7728394,)
Class distribution: [6224347 1504047]

All feature names (16 total):
  1. Temperature(F)
  2. Visibility(mi)
  3. Precipitation(in)
  4. Hour
  5. Month
  6. Junction
  7. Crossing
  8. Stop
  9. Traffic_Signal
  10. Weather_Category_Cloudy
  11. Weather_Category_Fog/Mist
  12. Weather_Category_Other
  13. Weather_Category_Rain
  14. Weather_Category_Snow/Ice
  15. Weather_Category_Thunderstorm
  16. Weather_Category_Windy

Training set size: 6,182,715 samples
Testing set size: 1,545,679 samples
Number of features: 16

Training set class distribution:
  Non-severe: 4,979,477 (80.5%)
  Severe: 1,203,238 (19.5%)

Testing set class distribution:
  Non-severe: 1,244,870 (80.5%)
  Severe: 300,809 (19.5%)
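The shapes above imply a one-hot encoding step for the single categorical feature: 9 numeric columns plus Weather_Category dummies give 16 features in total, and since no `Weather_Category_Clear` dummy appears in the feature list, the baseline category was presumably dropped. A minimal sketch of that step on toy data (illustrative only):

```python
# Sketch (toy data) of the encoding implied by the shapes above.
# drop_first=True removes the alphabetically first category ("Clear"),
# which would leave 7 weather dummies on the real data.
import pandas as pd

df = pd.DataFrame({
    "Hour": [8, 17, 23],
    "Weather_Category": ["Clear", "Rain", "Snow/Ice"],
})
encoded = pd.get_dummies(df, columns=["Weather_Category"], drop_first=True)
print(list(encoded.columns))
# ['Hour', 'Weather_Category_Rain', 'Weather_Category_Snow/Ice']
```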

================================================================================
LOGISTIC REGRESSION (BASELINE)
================================================================================

============================================================
LOGISTIC REGRESSION RESULTS
============================================================
Accuracy:  0.5248 (52.48%)
Precision: 0.2435
Recall:    0.6842
F1-Score:  0.3592

Classification Report:
              precision    recall  f1-score   support

  Non-Severe     0.8644    0.4863    0.6224   1244870
      Severe     0.2435    0.6842    0.3592    300809

    accuracy                         0.5248   1545679
   macro avg     0.5539    0.5853    0.4908   1545679
weighted avg     0.7435    0.5248    0.5712   1545679


Confusion Matrix:
                 Predicted
                 Non-Severe    Severe
Actual Non-Severe   605,414   639,456
       Severe        94,990   205,819

VISUALIZATION: Logistic Regression Confusion Matrix

In [35]:
# ==== VISUALIZATION: Logistic Regression Confusion Matrix ====
class_names = ['Non-Severe', 'Severe']

fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(cm_lr)

ax.set_title('Logistic Regression – Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

ax.set_xticks(np.arange(len(class_names)))
ax.set_yticks(np.arange(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)

# Add counts in each cell
for i in range(cm_lr.shape[0]):
    for j in range(cm_lr.shape[1]):
        ax.text(j, i, f'{cm_lr[i, j]:,}', ha='center', va='center')

fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()
[Figure: Logistic Regression confusion matrix heatmap]
In [21]:
print("RANDOM FOREST CLASSIFIER")

rf_model = RandomForestClassifier(
    n_estimators=100,        # number of trees in the forest
    max_depth=20,            # cap tree depth to limit overfitting
    min_samples_split=50,    # require 50 samples before splitting a node
    random_state=42,
    n_jobs=-1,               # use all CPU cores
    class_weight='balanced'  # reweight classes to offset the 80/20 imbalance
)

rf_model.fit(X_train, y_train)

print("Training complete!")

y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

print("\n" + "="*60)
print("RANDOM FOREST RESULTS")
print("="*60)
print(f"Accuracy:  {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall:    {rf_recall:.4f}")
print(f"F1-Score:  {rf_f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, 
                          target_names=['Non-Severe', 'Severe'],
                          digits=4))

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:")
print(f"                 Predicted")
print(f"                 Non-Severe    Severe")
print(f"Actual Non-Severe  {cm_rf[0,0]:>8,}  {cm_rf[0,1]:>8,}")
print(f"       Severe      {cm_rf[1,0]:>8,}  {cm_rf[1,1]:>8,}")
RANDOM FOREST CLASSIFIER
Training complete!

============================================================
RANDOM FOREST RESULTS
============================================================
Accuracy:  0.5491 (54.91%)
Precision: 0.2731
Recall:    0.7922
F1-Score:  0.4061

Classification Report:
              precision    recall  f1-score   support

  Non-Severe     0.9071    0.4904    0.6366   1244870
      Severe     0.2731    0.7922    0.4061    300809

    accuracy                         0.5491   1545679
   macro avg     0.5901    0.6413    0.5214   1545679
weighted avg     0.7837    0.5491    0.5918   1545679


Confusion Matrix:
                 Predicted
                 Non-Severe    Severe
Actual Non-Severe   610,446   634,424
       Severe        62,498   238,311
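The `class_weight='balanced'` setting used above assigns each class a weight inversely proportional to its frequency, w_c = n_samples / (n_classes * n_c), so errors on the rare Severe class cost more during training. A small sketch of the weights this implies for roughly the 80.5% / 19.5% split reported earlier (toy counts, not the real data):

```python
# Sketch: the per-class weights behind class_weight='balanced'.
# Toy labels approximating the 80.5% / 19.5% split reported above.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_demo = np.array([0] * 805 + [1] * 195)
weights = compute_class_weight(class_weight="balanced",
                               classes=np.array([0, 1]), y=y_demo)
print({0: round(weights[0], 3), 1: round(weights[1], 3)})
# {0: 0.621, 1: 2.564} -- the Severe class counts about 4x as much
```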
In [36]:
# ==== VISUALIZATION: Random Forest Confusion Matrix ====
class_names = ['Non-Severe', 'Severe']

fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(cm_rf)

ax.set_title('Random Forest – Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

ax.set_xticks(np.arange(len(class_names)))
ax.set_yticks(np.arange(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)

for i in range(cm_rf.shape[0]):
    for j in range(cm_rf.shape[1]):
        ax.text(j, i, f'{cm_rf[i, j]:,}', ha='center', va='center')

fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()
[Figure: Random Forest confusion matrix heatmap]
In [19]:
print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)
print("\n'Which factors contribute THE MOST to severe accidents?'")

feature_importance = rf_model.feature_importances_

feature_names = list(X.columns) 


importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print("\n" + "="*60)
print("TOP 15 MOST IMPORTANT FEATURES")
print("="*60)
print(importance_df.head(15).to_string(index=False))

print("\n" + "="*60)
print("FEATURE IMPORTANCE BY CATEGORY")
print("="*60)


weather_importance = importance_df[importance_df['Feature'].str.contains('Weather', case=False)]['Importance'].sum()
time_importance = importance_df[importance_df['Feature'].str.contains('Hour|Month', case=False)]['Importance'].sum()
road_importance = importance_df[importance_df['Feature'].str.contains('Junction|Crossing|Stop|Traffic', case=False)]['Importance'].sum()
visibility_importance = importance_df[importance_df['Feature'].str.contains('Visibility', case=False)]['Importance'].sum()
temp_importance = importance_df[importance_df['Feature'].str.contains('Temperature', case=False)]['Importance'].sum()
precip_importance = importance_df[importance_df['Feature'].str.contains('Precipitation', case=False)]['Importance'].sum()

category_importance = pd.DataFrame({
    'Category': ['Weather Conditions', 'Visibility', 'Time of Day', 'Temperature', 
                 'Road Features', 'Precipitation'],
    'Total_Importance': [weather_importance, visibility_importance, time_importance,
                        temp_importance, road_importance, precip_importance]
}).sort_values('Total_Importance', ascending=False)

print(category_importance.to_string(index=False))


print("\n" + "="*60)
print("ALL FEATURES RANKED BY IMPORTANCE")
print("="*60)
print(importance_df.to_string(index=False))

print(f"\nSum of all feature importances: {importance_df['Importance'].sum():.4f}")
================================================================================
FEATURE IMPORTANCE ANALYSIS
================================================================================

'Which factors contribute THE MOST to severe accidents?'

============================================================
TOP 15 MOST IMPORTANT FEATURES
============================================================
                      Feature  Importance
                         Hour        0.24
                        Month        0.20
                     Crossing        0.14
               Temperature(F)        0.13
               Traffic_Signal        0.12
                         Stop        0.05
               Visibility(mi)        0.04
      Weather_Category_Cloudy        0.03
                     Junction        0.02
            Precipitation(in)        0.02
        Weather_Category_Rain        0.01
    Weather_Category_Fog/Mist        0.00
    Weather_Category_Snow/Ice        0.00
Weather_Category_Thunderstorm        0.00
       Weather_Category_Other        0.00

============================================================
FEATURE IMPORTANCE BY CATEGORY
============================================================
          Category  Total_Importance
       Time of Day              0.44
     Road Features              0.33
       Temperature              0.13
Weather Conditions              0.05
        Visibility              0.04
     Precipitation              0.02

============================================================
ALL FEATURES RANKED BY IMPORTANCE
============================================================
                      Feature  Importance
                         Hour        0.24
                        Month        0.20
                     Crossing        0.14
               Temperature(F)        0.13
               Traffic_Signal        0.12
                         Stop        0.05
               Visibility(mi)        0.04
      Weather_Category_Cloudy        0.03
                     Junction        0.02
            Precipitation(in)        0.02
        Weather_Category_Rain        0.01
    Weather_Category_Fog/Mist        0.00
    Weather_Category_Snow/Ice        0.00
Weather_Category_Thunderstorm        0.00
       Weather_Category_Other        0.00
       Weather_Category_Windy        0.00

Sum of all feature importances: 1.0000
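A caveat on the rankings above: impurity-based importances from tree ensembles can be biased toward continuous and high-cardinality features such as Hour and Temperature. Permutation importance is a common cross-check: shuffle one feature at a time and measure the drop in held-out score. A hedged sketch on synthetic data (not the accident dataset):

```python
# Sketch: permutation importance as a cross-check on impurity-based
# importances, run on a small synthetic problem for illustration.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=2000, n_features=5,
                                     n_informative=3, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=42)
rf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_tr, y_tr)

# Mean accuracy drop when each feature is shuffled on the test split
perm = permutation_importance(rf, X_te, y_te, n_repeats=5, random_state=42)
for i in np.argsort(perm.importances_mean)[::-1]:
    print(f"feature_{i}: mean drop = {perm.importances_mean[i]:.4f}")
```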
In [39]:
# ==== VISUALIZATION: Top 15 Feature Importances ====
top_n = 15
top_features = importance_df.head(top_n).iloc[::-1] 

plt.figure(figsize=(8, 6))
plt.barh(top_features['Feature'], top_features['Importance'])
plt.xlabel('Importance')
plt.title(f'Top {top_n} Features by Importance (Random Forest)')
plt.tight_layout()
plt.show()

# ==== VISUALIZATION: Feature Importance by Category ====
plt.figure(figsize=(7, 4))
plt.bar(category_importance['Category'], category_importance['Total_Importance'])
plt.ylabel('Total Importance')
plt.title('Feature Importance by Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
[Figure: Top 15 features by importance (Random Forest)]
[Figure: Feature importance by category]
In [34]:
print("\n" + "="*80)
print("ANSWERS TO RESEARCH QUESTIONS")
print("="*80)

print("\n" + "="*60)
print("RESEARCH QUESTION 1")
print("="*60)
print("Can we predict accident severity from environmental/temporal/road conditions?")
print("\nYes, to a useful degree: both models identify severe accidents well above chance.")
print(f"Logistic Regression: {lr_accuracy*100:.2f}% accuracy, {lr_recall*100:.2f}% recall")
print(f"Random Forest: {rf_accuracy*100:.2f}% accuracy, {rf_recall*100:.2f}% recall")
print(f"\nRandom Forest detects {cm_rf[1,1]:,} out of {cm_rf[1,1]+cm_rf[1,0]:,} severe accidents.")
print("This indicates that environmental, temporal, and road features carry predictive signal.")
print("="*60)

print("\n" + "="*60)
print("RESEARCH QUESTION 2")
print("="*60)
print("Which features are most important for predicting severe accidents?")
print("\nFeature Importance by Category:")
print("  Time of Day: 44.0%")
print("  Road Features: 33.0%")
print("  Temperature: 13.0%")
print("  Weather Conditions: 5.0%")
print("  Visibility: 4.0%")
print("  Precipitation: 2.0%")
print("\nTop 5 Individual Features:")
print("  1. Hour (24.0%)")
print("  2. Month (20.0%)")
print("  3. Crossing (14.0%)")
print("  4. Temperature (13.0%)")
print("  5. Traffic Signal (12.0%)")
print("\nKey Finding: Time of day is the most important factor, not weather.")
print("="*60)

print("\n" + "="*80)
print("MODEL COMPARISON & RECOMMENDATIONS")
print("="*80)

comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [lr_accuracy, rf_accuracy],
    'Precision': [lr_precision, rf_precision],
    'Recall': [lr_recall, rf_recall],
    'F1-Score': [lr_f1, rf_f1]
})

print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

print(f"\n" + "="*60)
print("RANDOM FOREST IMPROVEMENTS OVER BASELINE")
print("="*60)
print(f"Accuracy:  +{(rf_accuracy - lr_accuracy)*100:.2f} percentage points")
print(f"Precision: +{(rf_precision - lr_precision)*100:.2f} percentage points")
print(f"Recall:    +{(rf_recall - lr_recall)*100:.2f} percentage points")
print(f"F1-Score:  +{(rf_f1 - lr_f1)*100:.2f} percentage points")

severe_missed_improvement = cm_lr[1,0] - cm_rf[1,0]
print(f"Reduction in missed severe accidents: {(severe_missed_improvement/cm_lr[1,0])*100:.1f}%")

best_accuracy_idx = comparison_df['Accuracy'].idxmax()
best_recall_idx = comparison_df['Recall'].idxmax()
best_f1_idx = comparison_df['F1-Score'].idxmax()

print(f"\n" + "="*60)
print("BEST MODELS BY METRIC")
print("="*60)
print(f"Best Accuracy: {comparison_df.loc[best_accuracy_idx, 'Model']} ({comparison_df.loc[best_accuracy_idx, 'Accuracy']:.4f})")
print(f"Best Recall: {comparison_df.loc[best_recall_idx, 'Model']} ({comparison_df.loc[best_recall_idx, 'Recall']:.4f})")
print(f"Best F1-Score: {comparison_df.loc[best_f1_idx, 'Model']} ({comparison_df.loc[best_f1_idx, 'F1-Score']:.4f})")

print("\n" + "="*60)
print("RESEARCH QUESTION 3")
print("="*60)
print("How do different ML models compare in performance?")
print(f"\nRandom Forest outperforms Logistic Regression across all metrics.")
print(f"The biggest improvement is in recall: {lr_recall*100:.1f}% to {rf_recall*100:.1f}%.")
print(f"Random Forest detects {severe_missed_improvement:,} more severe accidents.")
print(f"This is a {(severe_missed_improvement/cm_lr[1,0])*100:.1f}% reduction in missed severe accidents.")
print("\nRandom Forest works better because it captures complex interactions")
print("between features like time of day and road conditions that linear models miss.")
print("="*60)

print("\n" + "="*60)
print("MODEL RECOMMENDATION")
print("="*60)
print("Recommended Model: Random Forest")
print(f"\nReason: Best recall at {rf_recall*100:.2f}% means it catches most severe accidents.")
print(f"Accuracy: {rf_accuracy*100:.2f}%")
print(f"F1-Score: {rf_f1:.4f}")
print("\nThis model can be used for emergency response planning and driver safety alerts.")
print("="*60)
================================================================================
ANSWERS TO RESEARCH QUESTIONS
================================================================================

============================================================
RESEARCH QUESTION 1
============================================================
Can we predict accident severity from environmental/temporal/road conditions?

Yes, to a useful degree: both models identify severe accidents well above chance.
Logistic Regression: 52.48% accuracy, 68.42% recall
Random Forest: 54.91% accuracy, 79.22% recall

Random Forest detects 238,311 out of 300,809 severe accidents.
This indicates that environmental, temporal, and road features carry predictive signal.
============================================================

============================================================
RESEARCH QUESTION 2
============================================================
Which features are most important for predicting severe accidents?

Feature Importance by Category:
  Time of Day: 44.0%
  Road Features: 33.0%
  Temperature: 13.0%
  Weather Conditions: 5.0%
  Visibility: 4.0%
  Precipitation: 2.0%

Top 5 Individual Features:
  1. Hour (24.0%)
  2. Month (20.0%)
  3. Crossing (14.0%)
  4. Temperature (13.0%)
  5. Traffic Signal (12.0%)

Key Finding: Time of day is the most important factor, not weather.
============================================================

================================================================================
MODEL COMPARISON & RECOMMENDATIONS
================================================================================

============================================================
MODEL PERFORMANCE COMPARISON
============================================================
              Model  Accuracy  Precision  Recall  F1-Score
Logistic Regression      0.52       0.24    0.68      0.36
      Random Forest      0.55       0.27    0.79      0.41

============================================================
RANDOM FOREST IMPROVEMENTS OVER BASELINE
============================================================
Accuracy:  +2.43 percentage points
Precision: +2.96 percentage points
Recall:    +10.80 percentage points
F1-Score:  +4.70 percentage points
Reduction in missed severe accidents: 34.2%

============================================================
BEST MODELS BY METRIC
============================================================
Best Accuracy: Random Forest (0.5491)
Best Recall: Random Forest (0.7922)
Best F1-Score: Random Forest (0.4061)

============================================================
RESEARCH QUESTION 3
============================================================
How do different ML models compare in performance?

Random Forest outperforms Logistic Regression across all metrics.
The biggest improvement is in recall: 68.4% to 79.2%.
Random Forest detects 32,492 more severe accidents.
This is a 34.2% reduction in missed severe accidents.

Random Forest works better because it captures complex interactions
between features like time of day and road conditions that linear models miss.
============================================================

============================================================
MODEL RECOMMENDATION
============================================================
Recommended Model: Random Forest

Reason: Best recall at 79.22% means it catches most severe accidents.
Accuracy: 54.91%
F1-Score: 0.4061

This model can be used for emergency response planning and driver safety alerts.
============================================================
In [40]:
# ==== VISUALIZATION: Model Performance Comparison ====
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
width = 0.35

plt.figure(figsize=(7, 4))
plt.bar(x - width/2, comparison_df.loc[0, metrics], width, label='Logistic Regression')
plt.bar(x + width/2, comparison_df.loc[1, metrics], width, label='Random Forest')

plt.xticks(x, metrics)
plt.ylabel('Score')
plt.ylim(0, 1.0)
plt.title('Model Performance Comparison')
plt.legend()
plt.tight_layout()
plt.show()
[Figure: Model performance comparison bar chart]
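All four metrics in the comparison above depend on the default 0.5 decision threshold. A threshold-free metric such as ROC AUC complements them when comparing models. A sketch on synthetic, similarly imbalanced data (illustrative only, not the tutorial's actual scores):

```python
# Sketch: threshold-independent model comparison via ROC AUC,
# on a synthetic 80/20 imbalanced problem for illustration.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=3000, n_features=8,
                                     weights=[0.8, 0.2], random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_demo, y_demo, stratify=y_demo, random_state=42)

aucs = {}
for name, model in [
    ("Logistic Regression", LogisticRegression(max_iter=1000,
                                               class_weight="balanced")),
    ("Random Forest", RandomForestClassifier(n_estimators=50,
                                             random_state=42)),
]:
    model.fit(X_tr, y_tr)
    # AUC uses predicted probabilities, so no threshold is involved
    aucs[name] = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
    print(f"{name}: ROC AUC = {aucs[name]:.3f}")
```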

Insights & Conclusion

Taken together, our exploration of the U.S. Accidents dataset shows that severe crashes are a relatively small share of all reported events but still represent a very large absolute burden. Roughly four out of five crashes in the data are mid-level Severity 2; the most minor events (Severity 1) are quite rare, while the more serious Severity 3 and 4 crashes make up about one fifth of the data. Accident counts grew steadily from 2016 through a sharp peak in 2020, then declined but remained well above pre-2019 levels, hinting at persistent changes in travel patterns and reporting after the pandemic years. Statistical tests confirmed that both time of day and visibility are related to how serious a crash becomes, although their practical effect is small compared with the overall volume of traffic. Chi-square tests and t-tests showed that day/night severity differences are highly significant, and a two-way ANOVA indicated that day versus night has a stronger effect than good versus poor visibility.

Our comparison of California and Texas showed an even clearer signal. Using a two-proportion z-test, we found that Texas has a much higher share of severe crashes than California, a gap far too large to be explained by random variation. This pattern is consistent with real-world differences in typical speed limits and vehicle mix, and suggests that state-level policies around speed and road design may play an important role in safety outcomes.
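The two-proportion z-test used for the state comparison can be sketched as follows. The counts below are made up for illustration and are not the actual California/Texas figures:

```python
# Sketch: two-proportion z-test for H0: p1 == p2, using the pooled
# proportion for the standard error. Counts are hypothetical.
import numpy as np
from scipy.stats import norm

def two_prop_ztest(x1, n1, x2, n2):
    """Two-sided z-test comparing two sample proportions x1/n1 and x2/n2."""
    p1, p2 = x1 / n1, x2 / n2
    p_pool = (x1 + x2) / (n1 + n2)
    se = np.sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))
    z = (p1 - p2) / se
    p_value = 2 * norm.sf(abs(z))  # two-sided tail probability
    return z, p_value

# Hypothetical severe-crash counts: 30% of 100k vs 35% of 100k
z, p = two_prop_ztest(30_000, 100_000, 35_000, 100_000)
print(f"z = {z:.2f}, p = {p:.2e}")
```

With samples this large, even a five-point gap in proportions yields an enormous z-statistic, which is why the observed state-level difference cannot plausibly be random variation.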

The machine learning section adds another perspective by asking how well we can predict severe crashes from conditions at the scene. Even with a relatively small set of features focused on weather, time, and road context, both models were able to identify many severe crashes, with the random forest improving substantially over logistic regression in recall and F1-score while only slightly increasing overall accuracy. The confusion matrices show that the random forest misses about one third fewer severe crashes than the baseline model, which matters in a safety context where failing to flag dangerous events is costly. Feature-importance analysis revealed that time of day and road features such as crossings and traffic signals carry far more weight than individual weather categories or visibility values, and that temperature and light precipitation matter only at the margin.

These findings suggest that interventions focused on when and where people drive, such as targeted enforcement or dynamic warnings during high-risk hours and at complex intersections, may yield more benefit than purely weather-based alerts. At the same time, our models ignore driver behavior and vehicle characteristics, so there is clear room for future work that combines this kind of large-scale environmental dataset with richer information about human and vehicle factors to build more accurate and more actionable crash-risk tools.
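The "about one third fewer missed severe crashes" figure follows directly from the two confusion matrices reported earlier in the tutorial, and can be recomputed in a few lines:

```python
# Recompute the missed-severe reduction from the reported confusion matrices.
# Rows are actual (Non-Severe, Severe); columns are predicted.
import numpy as np

cm_lr = np.array([[605_414, 639_456],
                  [ 94_990, 205_819]])   # logistic regression
cm_rf = np.array([[610_446, 634_424],
                  [ 62_498, 238_311]])   # random forest

missed_lr, missed_rf = cm_lr[1, 0], cm_rf[1, 0]   # false negatives
reduction = (missed_lr - missed_rf) / missed_lr
print(f"Missed severe: LR = {missed_lr:,}, RF = {missed_rf:,}")
print(f"Relative reduction: {reduction:.1%}")      # 34.2%
```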