gaFeatureSelectionExample.py
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from deap import creator, base, tools, algorithms
from scoop import futures
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt
# Read in data from CSV
# Data set from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
dfData = pd.read_csv('bank-additional-full.csv', sep=';')
# Encode the classification labels to numbers
# Get classes and one hot encoded feature vectors
le = LabelEncoder()
le.fit(dfData['y'])
allClasses = le.transform(dfData['y'])
allFeatures = dfData.drop(['y'], axis=1)
# Form training, test, and validation sets
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(allFeatures, allClasses, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_trainAndTest, y_trainAndTest, test_size=0.20, random_state=42)
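# The two 80/20 splits above yield roughly 64% train, 16% test, and 20%
# validation of the full data set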
# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test):
    # Parse out the feature columns that we don't use, then
    # apply one-hot encoding to the kept features
    cols = [index for index in range(len(individual)) if individual[index] == 0]
    X_trainParsed = X_train.drop(X_train.columns[cols], axis=1)
    X_trainOhFeatures = pd.get_dummies(X_trainParsed)
    X_testParsed = X_test.drop(X_test.columns[cols], axis=1)
    X_testOhFeatures = pd.get_dummies(X_testParsed)

    # Remove any columns that aren't in both the training and test sets
    sharedFeatures = set(X_trainOhFeatures.columns) & set(X_testOhFeatures.columns)
    removeFromTrain = set(X_trainOhFeatures.columns) - sharedFeatures
    removeFromTest = set(X_testOhFeatures.columns) - sharedFeatures
    X_trainOhFeatures = X_trainOhFeatures.drop(list(removeFromTrain), axis=1)
    X_testOhFeatures = X_testOhFeatures.drop(list(removeFromTest), axis=1)

    # Apply logistic regression to the data, and calculate accuracy
    clf = LogisticRegression()
    clf.fit(X_trainOhFeatures, y_train)
    predictions = clf.predict(X_testOhFeatures)
    accuracy = accuracy_score(y_test, predictions)

    # Return the calculated accuracy as the fitness
    return (accuracy,)
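# Usage sketch (illustrative only, not part of the pipeline): an individual is a
# bit mask over the raw feature columns, and the fitness comes back as a
# one-element tuple because DEAP expects a tuple of fitness values. For example,
# a hypothetical mask keeping only the first raw feature could be scored with:
#
#   mask = [1] + [0] * (len(allFeatures.columns) - 1)
#   accuracy, = getFitness(mask, X_train, X_test, y_train, y_test)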
#========DEAP GLOBAL VARIABLES (viewable by SCOOP)========
# Create Individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
# Create Toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(dfData.columns) - 1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Continue filling toolbox...
toolbox.register("evaluate", getFitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
#========
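# The scoop import above suggests the fitness evaluations can be distributed.
# A minimal sketch, assuming SCOOP is installed: registering scoop's
# distributed map lets DEAP farm evaluations out to worker processes when the
# script is launched with SCOOP's runner.
#
#   toolbox.register("map", futures.map)
#
# and then launch with:
#
#   python -m scoop gaFeatureSelectionExample.py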
def getHof():
    # Initialize variables to use eaSimple
    numPop = 100
    numGen = 10
    pop = toolbox.population(n=numPop)
    hof = tools.HallOfFame(numPop * numGen)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    # Launch genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=numGen, stats=stats, halloffame=hof, verbose=True)

    # Return the hall of fame
    return hof
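# Note: DEAP's HallOfFame keeps its members sorted best-first, and with
# maxsize = numPop * numGen it is large enough to retain essentially every
# unique individual evaluated during the run; getMetrics below relies on that
# best-first ordering when it reverses the lists.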
def getMetrics(hof):
    # Get the list of percentiles in the hall of fame
    percentileList = [i / (len(hof) - 1) for i in range(len(hof))]

    # Gather fitness data from each percentile
    testAccuracyList = []
    validationAccuracyList = []
    individualList = []
    for individual in hof:
        testAccuracy = individual.fitness.values
        validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
        testAccuracyList.append(testAccuracy[0])
        validationAccuracyList.append(validationAccuracy[0])
        individualList.append(individual)

    # The hall of fame is ordered best-first; reverse all three lists together
    # so that index 0 is the worst individual and each accuracy stays aligned
    # with its individual
    testAccuracyList.reverse()
    validationAccuracyList.reverse()
    individualList.reverse()
    return testAccuracyList, validationAccuracyList, individualList, percentileList
if __name__ == '__main__':
    '''
    First, we will apply logistic regression using all the features to acquire a baseline accuracy.
    '''
    individual = [1 for i in range(len(allFeatures.columns))]
    testAccuracy = getFitness(individual, X_train, X_test, y_train, y_test)
    validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
    print('\nTest accuracy with all features: \t' + str(testAccuracy[0]))
    print('Validation accuracy with all features: \t' + str(validationAccuracy[0]) + '\n')

    '''
    Now, we will apply a genetic algorithm to choose a subset of features that gives a better accuracy than the baseline.
    '''
    hof = getHof()
    testAccuracyList, validationAccuracyList, individualList, percentileList = getMetrics(hof)

    # Get a list of subsets that performed best on the validation data
    maxValAccSubsetIndicies = [index for index in range(len(validationAccuracyList)) if validationAccuracyList[index] == max(validationAccuracyList)]
    maxValIndividuals = [individualList[index] for index in maxValAccSubsetIndicies]
    maxValSubsets = [[list(allFeatures)[index] for index in range(len(individual)) if individual[index] == 1] for individual in maxValIndividuals]

    print('\n---Optimal Feature Subset(s)---\n')
    for index in range(len(maxValAccSubsetIndicies)):
        print('Percentile: \t\t\t' + str(percentileList[maxValAccSubsetIndicies[index]]))
        print('Validation Accuracy: \t\t' + str(validationAccuracyList[maxValAccSubsetIndicies[index]]))
        print('Individual: \t' + str(maxValIndividuals[index]))
        print('Number of Features in Subset: \t' + str(len(maxValSubsets[index])))
        print('Feature Subset: ' + str(maxValSubsets[index]))

    '''
    Now, we plot the test and validation classification accuracy to see how these numbers change as we move from our worst feature subsets to the
    best feature subsets found by the genetic algorithm.
    '''
    # Calculate a best-fit curve for the validation classification accuracy (non-linear, cubic spline)
    tck = interpolate.splrep(percentileList, validationAccuracyList, s=5.0)
    ynew = interpolate.splev(percentileList, tck)

    e = plt.figure(1)
    plt.plot(percentileList, validationAccuracyList, marker='o', color='r')
    plt.plot(percentileList, ynew, color='b')
    plt.title('Validation Set Classification Accuracy vs. \n Continuum with Cubic-Spline Interpolation')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    e.show()

    f = plt.figure(2)
    plt.scatter(percentileList, validationAccuracyList)
    plt.title('Validation Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    f.show()

    g = plt.figure(3)
    plt.scatter(percentileList, testAccuracyList)
    plt.title('Test Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Test Set Accuracy')
    g.show()

    # Keep the figures open until the user presses Enter
    input()
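# To run (assuming Python 3 with pandas, scikit-learn, deap, scoop, scipy, and
# matplotlib installed, and bank-additional-full.csv in the working directory):
#
#   python gaFeatureSelectionExample.py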