Required packages

In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

Loading the data

In [2]:
df = datasets.load_iris()
#df = datasets.load_breast_cancer()

Supervised learning

Naïve Bayes

In [3]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring='accuracy', return_train_score=True)
print(scores['test_score'])
cm = confusion_matrix(df.target, y_pred)
print(cm)
Cross-validation accuracy: 0.95
[0.93333333 0.96666667 0.93333333 0.93333333 1.        ]
[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]
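
The confusion matrix above is computed on the same data the model was fitted on, so it is optimistic. A minimal sketch of a held-out evaluation (test_size and random_state here are arbitrary choices, not from the original):

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples so the matrix reflects unseen data
X_train, X_test, y_train, y_test = train_test_split(
    df.data, df.target, test_size=0.3, random_state=0)
nb = GaussianNB().fit(X_train, y_train)
print(confusion_matrix(y_test, nb.predict(X_test)))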

Support Vector Machine

In [4]:
from sklearn.svm import SVC

df = datasets.load_breast_cancer()  # binary problem, so roc_auc is well defined

clf = SVC(kernel='linear', gamma='auto')
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring=('accuracy', 'roc_auc'), return_train_score=True)
print(scores['test_accuracy'])
print(scores['test_roc_auc'])
cm = confusion_matrix(df.target, y_pred)
print(cm)
Cross-validation accuracy: 0.95
[0.94782609 0.93043478 0.97345133 0.92035398 0.95575221]
[0.99095607 0.99321705 0.99798793 0.97585513 0.99731724]
[[201  11]
 [  8 349]]
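
SVMs are sensitive to feature scales, and the breast cancer features span very different ranges. A minimal sketch of the same evaluation with standardization in a pipeline (StandardScaler is one common choice; it is not part of the original notebook):

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each feature before the SVM; cross_val_score refits the
# scaler inside every fold, so no information leaks from the test fold
scaled_clf = make_pipeline(StandardScaler(), SVC(kernel='linear'))
print(f'Cross-validation accuracy: {cross_val_score(scaled_clf, df.data, df.target, cv=5).mean():.2f}')
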
In [5]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Binarize the outputs (one indicator column per class)
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# Add noise to make the problem harder (disabled here)
random_state = np.random.RandomState(0)
#n_samples, n_features = X.shape
#X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Shuffle and split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=random_state)

# Learn to predict each class against the others (one-vs-rest)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and AUC
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
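
The figure only draws the curve for class 2, but the loop above already computed one curve per class plus the micro-average. A minimal sketch that plots them all, reusing the fpr, tpr and roc_auc dictionaries from the cell above:

In [ ]:
plt.figure()
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='Class %d (area = %0.2f)' % (i, roc_auc[i]))
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', linestyle=':', lw=2,
         label='Micro-average (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()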

Loading the data

In [6]:
#df = datasets.load_iris()
df = datasets.load_breast_cancer()

Decision Tree

In [7]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy')
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
Cross-validation accuracy: 0.93
In [8]:
from matplotlib.pyplot import figure
figure(dpi=200)
tree_plot = tree.plot_tree(clf, feature_names=df['feature_names'])
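
If graphviz/pydotplus (used in the next cell) is not available, scikit-learn's export_text prints the same tree as plain text. A minimal sketch:

In [ ]:
from sklearn.tree import export_text

# Plain-text dump of the fitted tree; no graphviz dependency required
print(export_text(clf, feature_names=list(df['feature_names'])))
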
In [9]:
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())
Out[9]:

Lazy learning

KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

clf = KNeighborsClassifier(n_neighbors=3)
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring='accuracy', return_train_score=True)
print(scores['test_score'])
cm = confusion_matrix(df.target, y_pred)
print(cm)

# Sweep k from 1 to 25
X = df.data
y = df.target

# Add noise to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 2*n_features)]

# Shuffle and split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=4)

k_range = range(1,26)
scores = {}
scores_list = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    y_pred = knn.predict(X_test)
    scores[k] = metrics.accuracy_score(y_test,y_pred)
    scores_list.append(scores[k])

plt.plot(k_range,scores_list)
plt.xlabel('k value for kNN')
plt.ylabel('Test accuracy')
Cross-validation accuracy: 0.92
[0.87826087 0.92173913 0.94690265 0.9380531  0.91150442]
[[193  19]
 [  6 351]]
Out[10]:
Text(0, 0.5, 'Test accuracy')
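
The sweep above scores each k on a single train/test split, which makes the curve noisy. A minimal sketch that chooses k by cross-validation over the same range instead (the GridSearchCV settings are illustrative):

In [ ]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the same values of k
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(range(1, 26))}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)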

Ensembles

Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
clf = RandomForestClassifier(n_estimators=100)
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring='accuracy', return_train_score=True)
print(scores['test_score'])
cm = confusion_matrix(df.target, y_pred)
print(cm)

# Extract the tree at index 5 from the forest
concrete_tree = clf.estimators_[5]
dot_data = StringIO()
export_graphviz(concrete_tree, out_file=dot_data, filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())
Cross-validation accuracy: 0.96
[0.93913043 0.94782609 0.98230088 0.96460177 0.96460177]
[[212   0]
 [  0 357]]
Out[11]:
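
Besides inspecting individual trees, the fitted forest exposes an importance score per feature. A minimal sketch printing the ten most important ones:

In [ ]:
# Mean decrease in impurity per feature, averaged over the 100 trees
importances = pd.Series(clf.feature_importances_, index=df['feature_names'])
print(importances.sort_values(ascending=False).head(10))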

Gradient Boosting and XGBoost

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib.pyplot import figure
clf = GradientBoostingClassifier()
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring='accuracy', return_train_score=True)
print(scores['test_score'])
cm = confusion_matrix(df.target, y_pred)
print(cm)
Cross-validation accuracy: 0.96
[0.93913043 0.93913043 0.97345133 0.98230088 0.98230088]
[[212   0]
 [  0 357]]
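
GradientBoostingClassifier also offers staged_predict, which yields predictions after each boosting round and shows when adding more trees stops helping. A minimal sketch on a held-out split (the split parameters are arbitrary):

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_tr, X_te, y_tr, y_te = train_test_split(
    df.data, df.target, test_size=0.3, random_state=0)
gb = GradientBoostingClassifier().fit(X_tr, y_tr)

# Held-out accuracy after each boosting stage
stage_acc = [metrics.accuracy_score(y_te, y_pred)
             for y_pred in gb.staged_predict(X_te)]
plt.plot(range(1, len(stage_acc) + 1), stage_acc)
plt.xlabel('Number of trees')
plt.ylabel('Test accuracy')
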
In [13]:
from xgboost import XGBClassifier
from xgboost import plot_tree
import matplotlib.pyplot as plt

# load data
X = df.data
y = df.target

# Fit the model
model = XGBClassifier()
model.fit(X, y)
# Plot the first tree horizontally (disabled)
#plot_tree(model, rankdir='LR')

# Plot the fifth tree (num_trees=4) vertically
plot_tree(model, num_trees=4)
fig = plt.gcf()
fig.set_size_inches(150, 100)
fig.savefig('tree.png')
plt.show()
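
XGBClassifier implements the scikit-learn estimator interface, so the cross-validation helpers used throughout this notebook apply to it directly. A minimal sketch:

In [ ]:
# XGBClassifier is scikit-learn compatible, so cross_val_score works as-is
print(f'Cross-validation accuracy: {cross_val_score(XGBClassifier(), df.data, df.target, cv=5).mean():.2f}')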

AdaBoost

In [14]:
from sklearn.ensemble import AdaBoostClassifier
clf_base = tree.DecisionTreeClassifier()
clf = AdaBoostClassifier(base_estimator=clf_base)
y_pred = clf.fit(df.data, df.target).predict(df.data)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')
cv_results = cross_validate(clf, df.data, df.target, cv=5)
scores = cross_validate(clf, df.data, df.target, cv=5, scoring='accuracy', return_train_score=True)
print(scores['test_score'])
cm = confusion_matrix(df.target, y_pred)
print(cm)
Cross-validation accuracy: 0.92
[0.91304348 0.90434783 0.91150442 0.94690265 0.90265487]
[[212   0]
 [  0 357]]
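
With a fully grown DecisionTreeClassifier as the base estimator, each boosting round already fits the training data almost perfectly, which limits what AdaBoost can add. The classic setup boosts depth-1 stumps instead; a minimal sketch (n_estimators=100 is an arbitrary choice):

In [ ]:
# Classic AdaBoost: many weak depth-1 stumps rather than one deep tree per round
stump = tree.DecisionTreeClassifier(max_depth=1)
clf = AdaBoostClassifier(base_estimator=stump, n_estimators=100)
print(f'Cross-validation accuracy: {cross_val_score(clf, df.data, df.target, cv=5).mean():.2f}')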