Decision Trees

They provide a graphical representation of a sequential decision process.

  • Decision points and alternatives

# https://www.codificandobits.com/blog/clasificacion-arboles-decision-algoritmo-cart/
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X0, y0 = make_classification(n_samples=20,
    n_features=2, n_redundant=0, n_informative=1, random_state=1,
    n_clusters_per_class=1)

# Reference grid of points
x = np.arange(-2, 3, 1)
x_, y_ = np.meshgrid(x, x)

fig, ax = plt.subplots()

# Scatter the two classes
ax.plot(X0[:, 0][y0 == 0], X0[:, 1][y0 == 0], "ro", alpha=0.5)
ax.plot(X0[:, 0][y0 == 1], X0[:, 1][y0 == 1], "bo", alpha=0.5)
ax.plot(x_, y_, "k.")

# Candidate split thresholds
ax.axvline(x=1, color='m', label="x1<=1")
ax.axhline(y=0.2, color='y', label="x2<0.2")
ax.axvline(x=0.0, color='k', label="x1<=0.0")

plt.legend(loc=3)
  • ref https://www.youtube.com/watch?v=kqaLlte6P6o&ab_channel=CodificandoBits

  • Hands-On Machine Learning (A. Géron)

  • Gini index

$G = 1 - p_{C_1}^{2} - p_{C_2}^{2}$, where $p_{C_i}$ is the fraction of the node's points belonging to class $i$.

  • If all the points in a region belong to a single class after drawing a split, the resulting nodes are pure: $G = 0$ (data of a single class).

  • Gini > 0 indicates impure data (a mix of classes).

The purple and yellow partitions of the previous plot represent pure nodes.

Meanwhile, the black line defines a split. Yes:

$G = 1 - (7/12)^{2} - (5/12)^{2}$

with weight given by $m_{\text{left}}/m = (7 + 5)/20 = 12/20$.

No:

$G = 1 - (3/7)^{2} - (4/7)^{2}$, with weight $m_{\text{right}}/m = (3 + 4)/20 = 7/20$

The cost function associated with the parent node:

$J(k, t_k) = \frac{m_{left}}{m} G_{left} + \frac{m_{right}}{m} G_{right} $
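A small plain-Python sketch (our own, using the counts quoted above) that reproduces these Gini values and the weighted cost:

def gini(counts):
    """Gini impurity of a node from its per-class counts."""
    total = sum(counts)
    return 1 - sum((c / total) ** 2 for c in counts)

g_yes = gini([7, 5])   # "Yes" node: 1 - (7/12)^2 - (5/12)^2 ≈ 0.486
g_no  = gini([3, 4])   # "No" node:  1 - (3/7)^2 - (4/7)^2  ≈ 0.490

m_yes, m_no = 12, 7    # points reaching each child, as quoted in the text
J = m_yes / (m_yes + m_no) * g_yes + m_no / (m_yes + m_no) * g_no
print(round(J, 3))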

Entropy (an alternative impurity measure to the Gini index):

$H_i = -\sum_{\substack{k=1 \\ p_{i,k} \neq 0}}^{n} p_{i,k}\log_2(p_{i,k})$
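The same node counts pushed through the entropy formula (a quick sketch using only NumPy; classes with $p_{i,k} = 0$ are skipped, as the summation condition requires):

import numpy as np

def entropy(counts):
    """Base-2 entropy of a node, ignoring empty classes."""
    p = np.asarray(counts, dtype=float)
    p = p[p > 0] / sum(counts)
    return -np.sum(p * np.log2(p))

print(entropy([7, 5]))   # impure node: ≈ 0.98 bits
print(entropy([12, 0]))  # pure node: 0 bits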

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
import graphviz
#from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree


# Libraries for draw contours
def make_meshgrid(x, y, h=0.02):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out


iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,\
                                                    test_size=0.33, random_state=42)
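Before fitting the classifier below, a quick usage sketch of the helper functions defined above (the name clf_demo and the choice of the two petal features are ours, for illustration):

# Sanity-check make_meshgrid / plot_contours on two iris features
clf_demo = DecisionTreeClassifier(max_depth=2).fit(X_train[:, 2:], y_train)
fig, ax = plt.subplots()
xx, yy = make_meshgrid(X_train[:, 2], X_train[:, 3])
plot_contours(ax, clf_demo, xx, yy, cmap=plt.cm.Paired, alpha=0.6)
ax.scatter(X_train[:, 2], X_train[:, 3], c=y_train, edgecolors='k')
plt.show()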
DecisionTreeClassifier?
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)

print(f"{clf.score(X_test, y_test)} ")
print(f"{clf.score(X_train, y_train)}" )
0.98 
0.95
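max_depth is the main regularizer here; a small sketch of our own (same split as above) comparing train and test accuracy across depths to see where the tree starts to overfit:

for depth in range(1, 7):
    m = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    print(depth, round(m.score(X_train, y_train), 3), round(m.score(X_test, y_test), 3))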
tree.export_graphviz?
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
# save graph, put out_file with the name
arbol = tree.export_graphviz(clf, out_file=None, 
                     class_names = iris.target_names,\
                    feature_names = iris.feature_names,\
                    impurity=False, filled=True)
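With out_file=None, export_graphviz returns the DOT source as a string, so arbol can be rendered with the graphviz package imported above; a minimal sketch (the output file name is our choice):

graph = graphviz.Source(arbol)                         # parse the DOT source
graph.render("iris_tree", format="png", cleanup=True)  # writes iris_tree.png
graph                                                  # in a notebook, displays inline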
tree.plot_tree?
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
graph=tree.plot_tree(clf,class_names = iris.target_names,\
              feature_names = iris.feature_names,\
              impurity=False, filled=True,rounded=True )
[Figure: the fitted tree rendered with tree.plot_tree]

Feature importances

clf.feature_importances_
array([0.        , 0.        , 0.55816894, 0.44183106])
caract = iris.data.shape[1]
plt.barh(range(caract), clf.feature_importances_)
plt.yticks(np.arange(caract),iris.feature_names)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()
[Figure: horizontal bar chart of the feature importances]
# Adapted from the scikit-learn decision-surface example:
# https://scikit-learn.org/stable/auto_examples/tree/plot_iris_dtc.html
# Parameters
n_classes = 3
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Shuffle
    idx = np.arange(X.shape[0])
    np.random.seed(13)
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    # Standardize
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)
  
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i])

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend()
plt.show()
[Figure: decision surfaces of a decision tree for each pair of iris features]
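The importances computed earlier were concentrated in the petal features (columns 2 and 3); a closing sketch of our own checks that a tree trained on just those two features scores about as well as the full four-feature model:

X_petal = iris.data[:, 2:]   # petal length and petal width only
Xtr, Xte, ytr, yte = train_test_split(X_petal, iris.target,
                                      test_size=0.33, random_state=42)
clf_petal = DecisionTreeClassifier(max_depth=2).fit(Xtr, ytr)
print(clf_petal.score(Xte, yte))   # expected: comparable to the 4-feature score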