# Hands-on: Study feature importance in the MAGIC Cherenkov telescope sample

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the MAGIC sample from a local copy of the data file.
# The same file can alternatively be read straight from
# "https://www.physi.uni-heidelberg.de/~marks/ml_einfuehrung/Beispiele/magic04_data.txt"
filename = "magic04_data.txt"

# Numeric labels for the two shower types:
# gamma shower (g) --> 1 (signal), hadron shower (h) --> 0 (background)
class_to_label = {'g': 1, 'h': 0}

df = pd.read_csv(filename, engine='python')
df['class'] = df['class'].map(class_to_label)

In [None]:
# Separate the table into the quantity to predict (y, the 'class' column as an
# array) and the features used to predict it (X, every remaining column).
target_col = "class"
y = df[target_col].values
X = df.drop(columns=target_col)

In [None]:
# Generate training and test samples: 10% of the events are held out for testing.
# random_state pins the shuffle so the split -- and therefore every AUC computed
# from it further down -- is identical after each "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=1
)

In [None]:
# Train the reference xgb classifier with all features and evaluate it on the
# test sample.
import xgboost as xgb
from sklearn.metrics import roc_auc_score

XGBclassifier = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000)
XGBclassifier.fit(X_train, y_train)

# Hard 0/1 class decisions (kept under the original name for later use).
y_pred_xgb = XGBclassifier.predict(X_test)

# The ROC curve is obtained by scanning a threshold over a continuous score, so
# the AUC must be computed from the predicted signal probability (column 1 of
# predict_proba). Passing the hard 0/1 labels instead would reduce the curve to
# a single operating point and misstate the AUC.
# NOTE: any AUC this value is compared with must be computed from
# predict_proba scores as well.
y_score_xgb = XGBclassifier.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_score_xgb)
print(f'The AUC we get training with all features is {auc}')

a) Now train classifiers with only $n-1$ features (dropping one feature at a time) and recalculate the AUC for each of them.

In [None]:
# Retrain the classifier once per feature, each time leaving exactly that
# feature out, and store the resulting test-sample AUC keyed by the dropped
# feature.
auc_dropfeat = {}
for feat in X.columns:
    feats = X.columns.drop(feat)
    clf = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000)
    clf.fit(X_train[feats], y_train)
    # AUC needs a continuous score: use the predicted signal probability
    # (column 1 of predict_proba) rather than the hard 0/1 labels from predict().
    # A separate variable is used so the predictions of the all-feature
    # classifier are not overwritten by the last reduced model.
    # NOTE: the all-feature AUC these values are compared with must be computed
    # from predict_proba scores as well.
    y_score_dropfeat = clf.predict_proba(X_test[feats])[:, 1]
    auc_dropfeat[feat] = roc_auc_score(y_test, y_score_dropfeat)
    print(f'Dropping feature {feat} we get an AUC of {auc_dropfeat[feat]}')

In [None]:
# Visualise how much AUC is lost, relative to the all-feature classifier, when
# a single feature is dropped: (auc - auc_dropfeat[feat]) / auc.
# A larger bar means the classifier suffers more without that feature.
feats = X.columns
auc_gain = [(auc - auc_dropfeat[feat])/auc for feat in feats]

# Explicit figure/axes with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.barh(feats, auc_gain)
ax.axvline(0, color="black", linewidth=0.8)  # bars left of 0: AUC improved without the feature
ax.set_xlabel("relative AUC loss when the feature is dropped")
ax.set_ylabel("dropped feature")
ax.set_title("Drop-one-feature importance");

b) How does this compare to the `plot_importance` function provided by XGBoost (see [XGBoost plotting API](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.plotting))? Do you get the same answer for all three importance types provided by XGBoost (“weight”, “gain”, and “cover”)?

In [None]:
# Show the importance types offered by XGBoost side by side so their feature
# rankings can be compared directly:
#   "weight" - how often a feature is used to split the data across all trees
#   "gain"   - average gain of the splits that use the feature
#   "cover"  - average coverage of the splits that use the feature
importance_types = ("weight", "gain", "cover")
fig, axes = plt.subplots(1, len(importance_types), figsize=(18, 5))
for ax, importance_type in zip(axes, importance_types):
    xgb.plot_importance(
        XGBclassifier,
        ax=ax,
        importance_type=importance_type,
        title=f"Feature importance ({importance_type})",
        show_values=False,  # the raw gain/cover numbers are long floats and clutter the bars
    )
fig.tight_layout();