# Simple classification example: the iris dataset

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the iris data set that ships with scikit-learn.
# Feature columns, in order: Sepal Length, Sepal Width, Petal Length, Petal Width
iris = datasets.load_iris()
# X holds the feature values, y the class label of each sample
X, y = iris.data, iris.target

In [None]:
# Put the features and the class label into a DataFrame, purely to get a
# readable table of the first few samples.
df = pd.DataFrame(
    X,
    columns=[
        "Sepal Length (cm)",
        "Sepal Width (cm)",
        "Petal Length (cm)",
        "Petal Width (cm)",
    ],
    copy=True,  # keep the table independent of X
)
df["category"] = y
df.head()

In [None]:
# Show the class names as plain Python strings; list() over the NumPy array
# would keep numpy.str_ scalars, which NumPy >= 2 displays as np.str_('...').
iris.target_names.tolist()

In [None]:
# Split the data into a training half and a test half.
# Passing a fixed integer as random_state makes the random split
# reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Two side-by-side scatter plots of the full data set, colored by class label
fig, (ax_sepal, ax_petal) = plt.subplots(1, 2, figsize=(10, 5))

# left panel: sepal length vs. sepal width
ax_sepal.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')
ax_sepal.set_xlabel('Sepal length')
ax_sepal.set_ylabel('Sepal width')

# right panel: petal length vs. petal width
ax_petal.scatter(X[:, 2], X[:, 3], c=y, edgecolor='k')
ax_petal.set_xlabel('Petal length')
ax_petal.set_ylabel('Petal width')

## Softmax regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Softmax (multinomial) logistic regression with L2 regularization.
# Both settings are the estimator's defaults for a multiclass target, so
# they are not passed explicitly: the `multi_class` argument is deprecated
# since scikit-learn 1.5 and triggers a FutureWarning when given.
# Note: the string penalty='none' was deprecated in scikit-learn 1.2 and
# removed in 1.4.
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train);

## k-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbors classifier with k = 5, fitted on the training split
# (fit() returns the estimator itself, so construction and fit can be chained)
kn_neigh = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

## Fisher linear discriminant

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with default settings, fitted on the training
# split (fit() returns the estimator itself, so the calls can be chained)
fisher_ld = LinearDiscriminantAnalysis().fit(x_train, y_train)

## Classification accuracy

In [None]:
# Evaluate each fitted classifier on the held-out test split
for clf in [log_reg, kn_neigh, fisher_ld]:
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(type(clf).__name__)
    print(f"accuracy: {acc:0.2f}")

    # confusion matrix: rows = true class, columns = predicted class
    # (entry [i, j] counts samples of true class i predicted as class j)
    print(confusion_matrix(y_test, y_pred), "\n")

In [None]:
# Per-class precision / recall / F1 report for the softmax regression model
y_pred = log_reg.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# How to read the report:
# - "support" is the number of test samples that truly belong to each class,
#   i.e. it shows how the classes are distributed in the test set.
# - "f1-score" combines precision and recall into a single value that
#   balances the two, which makes it a handy summary of overall
#   classification performance.