Code
```
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
# 1) Load the raw table; low_memory=False parses the file in one pass
#    instead of in chunks.
df = pd.read_csv("vehicles.csv", low_memory=False)
# 2) Keep only the columns used below, then force the two measurement
#    columns to numeric (errors="coerce" turns unparseable entries into NaN).
df = df.loc[:, ["displ", "highway08", "fuelType"]].copy()
df = df.assign(
    displ=pd.to_numeric(df["displ"], errors="coerce"),
    highway08=pd.to_numeric(df["highway08"], errors="coerce"),
)
# 3) Normalize fuel labels and restrict to Gasoline vs Diesel
def norm_fuel(s):
    """Collapse a raw fuel label into ``"Diesel"``, ``"Gasoline"`` or NaN.

    The label is stringified and lower-cased, then matched by substring.
    "diesel" is tested first, so a label that contains both "diesel" and a
    gasoline keyword is classified as Diesel.

    Args:
        s: Raw fuel label. Any object is accepted because it is passed
            through ``str()``; a missing value (e.g. NaN or None) becomes
            the text "nan"/"none", matches no keyword and yields NaN.

    Returns:
        ``"Diesel"``, ``"Gasoline"``, or ``np.nan`` when no keyword matches.
    """
    label = str(s).lower()
    if "diesel" in label:
        return "Diesel"
    # Grade names ("premium", "regular") count as gasoline keywords.
    if any(word in label for word in ("gasoline", "premium", "regular")):
        return "Gasoline"
    # NOTE(review): any label without one of the keywords above (other
    # grades, blends, non-liquid fuels) ends up as NaN and is dropped by
    # the caller -- confirm that this matches the intended sample.
    return np.nan
# Classify every row, drop rows missing any modelling input, keep only the
# two fuel groups, and finally add the mean-centred displacement.
# 4) Centring is optional but helpful: the mean is taken over the rows that
#    survive the filtering above.
df = (
    df.assign(fuel2=df["fuelType"].apply(norm_fuel))
    .dropna(subset=["displ", "highway08", "fuel2"])
    .loc[lambda kept: kept["fuel2"].isin(["Gasoline", "Diesel"])]
    .assign(displ_c=lambda kept: kept["displ"] - kept["displ"].mean())
)
# 5) OLS with a displacement x fuel interaction (the formula is a STRING).
# The category order below fixes the baseline: 'Gasoline' is listed first,
# so it is the reference level; re-order the categories to change that.
df["fuel2"] = df["fuel2"].astype(
    pd.CategoricalDtype(categories=["Gasoline", "Diesel"])
)
model = smf.ols(
    "highway08 ~ displ_c * C(fuel2)",
    data=df,
).fit()
print(model.summary())
# 6) Simple slopes: MPG change per +1L displacement, per fuel type.
b = model.params
# With 'Gasoline' as baseline, its slope is the main displ_c coefficient.
slope_gasoline = b.loc["displ_c"]
# Diesel slope = baseline slope + interaction term. If the interaction
# term is absent from the fitted parameters it is treated as 0.0, i.e. the
# Diesel slope silently equals the Gasoline slope.
slope_diesel = slope_gasoline + b.get("displ_c:C(fuel2)[T.Diesel]", 0.0)
print("\nSimple slopes (MPG change per +1L displacement):")
print(f"Gasoline: {slope_gasoline:.3f} MPG per liter")
print(f"Diesel: {slope_diesel:.3f} MPG per liter")
```











