A line‑by‑line walkthrough of the original Jupyter notebook,
rebuilt as a portable web presentation (HTML + JS).
# project_root/
├── data/
│ ├── Copernicus2020_2024.csv
│ ├── pm10_SiSaire2020_2024.csv
│ ├── pm25_SiSaire2020_2024.csv
│ └── duaod550_2020_2024.nc
├── src/
│ └── pipeline.py
└── notebooks/
└── exploration.ipynb (optional)
All scripts read ../data/* from src/; nothing is mounted or
uploaded to external services.
import pandas as pd, xarray as xr
cop = pd.read_csv("../data/Copernicus2020_2024.csv",
parse_dates=["valid_time"])
sis10 = pd.read_csv("../data/pm10_SiSaire2020_2024.csv",
parse_dates=["timestamp"])
sis25 = pd.read_csv("../data/pm25_SiSaire2020_2024.csv",
parse_dates=["timestamp"])
dust = (xr.open_dataset("../data/duaod550_2020_2024.nc")
.to_dataframe()
.reset_index())
Key point: local paths keep the project self‑hosted; xarray handles NetCDF.
def daily_mean(df, time_col, cols):
return (df.assign(date=lambda d: d[time_col].dt.floor("D"))
.groupby("date")[cols].mean())
cop_daily = daily_mean(cop, "valid_time", ["pm10","pm2p5"])
sis10_daily = daily_mean(sis10, "timestamp", ["PM10"])
dust_daily = daily_mean(dust, "time", ["duaod550"])
Each dataset collapses to one row per day, aligning everything on the calendar.
merged = (
cop_daily
.merge(sis10_daily, left_index=True, right_index=True, how="outer")
.merge(dust_daily, left_index=True, right_index=True, how="outer")
.dropna() # keeps fully‑populated days only
)
The result is the modelling table: one date, five columns (pm₂·₅, pm₁₀, dust AOD…).
def add_lags(df, col, lags=4):
for k in range(1, lags+1):
df[f"{col}_lag{k}"] = df[col].shift(k)
return df
def train_test_split(df, test_size=0.2):
split_at = int(len(df) * (1 - test_size))
return df.iloc[:split_at], df.iloc[split_at:]
Lags give the models “memory”; time‑ordered split avoids look‑ahead bias.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
df_lag = add_lags(merged.copy(), "PM10")
df_lag.dropna(inplace=True)
X = df_lag.filter(like="_lag").values
y = df_lag["duaod550"].values
train_X, test_X = train_test_split(X)
train_y, test_y = train_test_split(y)
model = Ridge(alpha=1.0).fit(train_X, train_y)
print("Test RMSE:", mean_squared_error(test_y, model.predict(test_X), squared=False))
Shows whether a linear + lagged relationship exists between ground PM and dust AOD.
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
rf = RandomForestRegressor(n_estimators=300, max_depth=12,
min_samples_leaf=3, random_state=0)
rf.fit(train_X, train_y)
xgb_model = xgb.XGBRegressor(n_estimators=600, learning_rate=0.05,
max_depth=8, subsample=0.9, colsample_bytree=0.8)
xgb_model.fit(train_X, train_y)
Captures non‑linearities & interactions that Ridge cannot represent.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import numpy as np
SEQ = 30 # lookback window (days)
scaler = MinMaxScaler()
pm10_scaled = scaler.fit_transform(merged["PM10"].values.reshape(-1,1))
def make_sequences(series, steps=SEQ):
X, y = [], []
for i in range(len(series)-steps):
X.append(series[i:i+steps])
y.append(series[i+steps])
return np.array(X), np.array(y)
X_seq, y_seq = make_sequences(pm10_scaled)
train_X, test_X = train_test_split(X_seq)
train_y, test_y = train_test_split(y_seq)
net = Sequential([
LSTM(64, return_sequences=True, input_shape=(SEQ,1)),
LSTM(32),
Dense(1)
])
net.compile(optimizer='adam', loss='mse')
net.fit(train_X, train_y, epochs=30, batch_size=32)
LSTM treats PM10 as a sequence, learning temporal dynamics beyond fixed lags.
import matplotlib.pyplot as plt
def plot_results(true, pred, label):
plt.figure(figsize=(9,4))
plt.plot(true, linewidth=1.2, label="Truth")
plt.plot(pred, linewidth=1.2, linestyle="--", label=label)
plt.axvline(len(train_y), linestyle=":", label="train/test split")
plt.legend(); plt.tight_layout(); plt.show()
plot_results(np.concatenate([train_y, test_y]),
np.concatenate([model.predict(train_X), model.predict(test_X)]),
"Ridge pred")
Every model generates similar plots & metrics for apples‑to‑apples comparison.
sklearn.model_selection.GridSearchCV.dust.shift(-n)).Fork & adapt the pipeline for any city or pollutant — it is 100 % local‐file based, Google‑agnostic, and stays under your control.