Decision Trees
Decision trees provide a graphical representation of a sequential decision process: decision points and the alternatives available at each one.
# https://www.codificandobits.com/blog/clasificacion-arboles-decision-algoritmo-cart/
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

# Toy 2-feature dataset with 20 samples
X0, y0 = make_classification(n_samples=20,
                             n_features=2, n_redundant=0, n_informative=1,
                             random_state=1, n_clusters_per_class=1)

# Integer grid, just to make the axes easier to read
x = np.arange(-2, 3, 1)
x_, y_ = np.meshgrid(x, x)

fig, ax = plt.subplots()
ax.plot(X0[:, 0][y0 == 0], X0[:, 1][y0 == 0], "ro", alpha=0.5)
ax.plot(X0[:, 0][y0 == 1], X0[:, 1][y0 == 1], "bo", alpha=0.5)
ax.plot(x_, y_, "k.")

# Candidate splits: each line partitions the feature space
ax.axvline(x=1, color='m', label="x1<=1")
ax.axhline(y=0.2, color='y', label="x2<0.2")
ax.axvline(x=0.0, color='k', label="x1<=0")
plt.legend(loc=3)
Ref: https://www.youtube.com/watch?v=kqaLlte6P6o&ab_channel=CodificandoBits
Ref: Hands-On Machine Learning (Aurélien Géron)
Gini index
$G = 1 - p_{C_1}^{2} - p_{C_2}^{2}$, where $p_{C_k}$ is the fraction of the node's samples that belong to class $k$.
If, after drawing the first split, all points on one side belong to a single class, the resulting nodes are pure: $G = 0$ (data from a single class).
$G > 0$ indicates impure data (a mixture of classes).
The magenta and yellow partitions in the plot above produce pure nodes.
The black line ($x_1 \le 0$), on the other hand, defines an impure split. For the points satisfying the condition (Yes): $G = 1 - (7/13)^{2} - (6/13)^{2}$, with weight $m_{left}/m = (7 + 6)/20 = 13/20$. For the remaining points (No): $G = 1 - (3/7)^{2} - (4/7)^{2}$, with weight $m_{right}/m = (3 + 4)/20 = 7/20$.
The cost function associated with the parent node; CART searches for the feature $k$ and threshold $t_k$ that minimize it:
$J(k, t_k) = \frac{m_{left}}{m} G_{left} + \frac{m_{right}}{m} G_{right} $
Entropy (an alternative impurity measure):
$H_i = -\sum_{\substack{k=1 \\ p_{i,k} \neq 0}}^{n} p_{i,k}\log_2(p_{i,k})$
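As a quick numeric check of the split above, here is a minimal sketch in plain NumPy; the helper names (gini, entropy, cart_cost) are ours, not scikit-learn API:

import numpy as np

def gini(counts):
    """Gini impurity of a node, from its per-class sample counts."""
    p = np.asarray(counts) / np.sum(counts)
    return 1.0 - np.sum(p ** 2)

def entropy(counts):
    """Shannon entropy of a node, skipping classes with p = 0."""
    p = np.asarray(counts) / np.sum(counts)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def cart_cost(left_counts, right_counts):
    """Weighted cost J(k, t_k) of a candidate split (CART criterion)."""
    m_left, m_right = np.sum(left_counts), np.sum(right_counts)
    m = m_left + m_right
    return (m_left / m) * gini(left_counts) + (m_right / m) * gini(right_counts)

# The black-line split above: 7+6 samples on the "Yes" side, 3+4 on the "No" side
print(f"G_left  = {gini([7, 6]):.3f}")   # 1 - (7/13)^2 - (6/13)^2
print(f"G_right = {gini([3, 4]):.3f}")   # 1 - (3/7)^2  - (4/7)^2
print(f"J       = {cart_cost([7, 6], [3, 4]):.3f}")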
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
import graphviz
#from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
# Helper functions to draw decision-region contours
def make_meshgrid(x, y, h=0.02):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
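A short usage sketch for these helpers; this mini example (fitting a shallow tree on two iris features) is ours:

# Fit a shallow tree on two iris features and draw its decision regions
iris = load_iris()
X2 = iris.data[:, [2, 3]]                      # petal length, petal width
clf2 = DecisionTreeClassifier(max_depth=2).fit(X2, iris.target)

xx, yy = make_meshgrid(X2[:, 0], X2[:, 1])
fig, ax = plt.subplots()
plot_contours(ax, clf2, xx, yy, cmap=plt.cm.Paired, alpha=0.6)
ax.scatter(X2[:, 0], X2[:, 1], c=iris.target, edgecolors="k")
plt.show()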
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,\
test_size=0.33, random_state=42)
DecisionTreeClassifier?
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)
print(f"{clf.score(X_test, y_test)} ")
print(f"{clf.score(X_train, y_train)}" )
0.98
0.95
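To see how max_depth trades training fit against generalization on this split, a small sweep (this loop is ours, not part of the original notebook):

# Compare train/test accuracy as the tree is allowed to grow deeper
for depth in range(1, 6):
    m = DecisionTreeClassifier(max_depth=depth, random_state=0)
    m.fit(X_train, y_train)
    print(f"depth={depth}: train={m.score(X_train, y_train):.2f}, "
          f"test={m.score(X_test, y_test):.2f}")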
tree.export_graphviz?
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
# To save the graph to disk, pass a file name via out_file;
# with out_file=None the DOT source is returned as a string.
arbol = tree.export_graphviz(clf, out_file=None,
                             class_names=iris.target_names,
                             feature_names=iris.feature_names,
                             impurity=False, filled=True)
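One way to display the returned DOT string is graphviz.Source from the graphviz package imported earlier:

# In a notebook the Source object renders inline;
# graphviz.Source(arbol).render("arbol") would also write files to disk.
graphviz.Source(arbol)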
tree.plot_tree?
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
graph=tree.plot_tree(clf,class_names = iris.target_names,\
feature_names = iris.feature_names,\
impurity=False, filled=True,rounded=True )
Feature importances
clf.feature_importances_
array([0. , 0. , 0.55816894, 0.44183106])
caract = iris.data.shape[1]
plt.barh(range(caract), clf.feature_importances_)
plt.yticks(np.arange(caract),iris.feature_names)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# Parameters
n_classes = 3
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Shuffle
    idx = np.arange(X.shape[0])
    np.random.seed(13)
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    # Standardize
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.Paired)

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend()
plt.show()