📜  使用Python从头开始网格搜索

📅  最后修改于: 2022-05-13 01:55:19.946000             🧑  作者: Mango




网格搜索可以应用于任何性能可以通过调整超参数来提升的算法。例如,我们可以在一组候选 K 值上验证 K 近邻的性能,从而对 K 近邻应用网格搜索。对逻辑回归也可以这样做:在一组候选学习率上评估模型,找出使逻辑回归准确率最高的学习率。


本文使用的糖尿病数据集(diabetes.csv)有 8 个特征列(如“年龄”、“葡萄糖”等),以及 108 名患者的目标变量“结果”。因此,下面我们将训练一个 Logistic 回归分类器模型,根据这些信息预测患者是否患有糖尿病。


# Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Grid Searching in Logistic Regression
class LogitRegression() :
    """Binary logistic-regression classifier trained by batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradients in every update.
    iterations : int
        Number of gradient-descent updates performed by ``fit``.

    After ``fit`` the learned parameters are available as ``W`` (one weight
    per feature) and ``b`` (bias).
    """

    def __init__( self, learning_rate, iterations ) :
        self.learning_rate = learning_rate
        self.iterations = iterations

    @staticmethod
    def _sigmoid( z ) :
        """Return the logistic function 1 / (1 + exp(-z)) element-wise."""
        return 1 / ( 1 + np.exp( - z ) )

    # Function for model training
    def fit( self, X, Y ) :
        """Train on feature matrix ``X`` (2-D) and labels ``Y``; return self.

        ``Y`` must hold exactly one label per row of ``X``; it is flattened
        to length ``m`` inside ``update_weights``.
        """
        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape
        # weight initialization
        self.W = np.zeros( self.n )
        self.b = 0
        self.X = X
        self.Y = Y
        # gradient descent learning: one full-batch update per iteration
        for _ in range( self.iterations ) :
            self.update_weights()
        return self

    # Helper function to update weights in gradient descent
    def update_weights( self ) :
        """Apply one gradient-descent step to ``W`` and ``b``; return self."""
        A = self._sigmoid( self.X.dot( self.W ) + self.b )
        # residuals (prediction - label), flattened so that both a
        # column-vector Y and a 1-D Y are handled the same way
        residual = A - np.reshape( self.Y, self.m )
        # calculate gradients (averaged over the training examples)
        dW = np.dot( self.X.T, residual ) / self.m
        db = np.sum( residual ) / self.m
        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self

    # Hypothetical function  h( x )
    def predict( self, X ) :
        """Return 1 where the predicted probability exceeds 0.5, else 0."""
        Z = self._sigmoid( X.dot( self.W ) + self.b )
        return np.where( Z > 0.5, 1, 0 )
# Driver code
def main() :
    """Grid-search learning rate and iteration count for LogitRegression.

    Reads ``diabetes.csv`` (last column is the label, the rest are features),
    holds out one third of the rows as a validation set, trains one model per
    (learning_rate, iterations) pair and prints the best validation accuracy
    in percent.
    """
    # Importing dataset
    df = pd.read_csv( "diabetes.csv" )
    X = df.iloc[:,:-1].values
    Y = df.iloc[:,-1:].values
    # Splitting dataset into train and validation set
    X_train, X_valid, Y_train, Y_valid = train_test_split(
      X, Y, test_size = 1/3, random_state = 0 )
    # Best validation accuracy seen so far
    max_accuracy = 0
    # learning_rate choices
    learning_rates = [ 0.1, 0.2, 0.3, 0.4, 0.5,
                      0.01, 0.02, 0.03, 0.04, 0.05 ]
    # iterations choices
    iterations = [ 100, 200, 300, 400, 500 ]
    # every combination of learning_rate and iterations (the grid)
    parameters = [ ( rate, n_iter )
                   for rate in learning_rates
                   for n_iter in iterations ]
    print("Available combinations : ",  parameters )
    # Linear scan over the grid, keeping the maximum accuracy
    # achieved on the validation set
    for rate, n_iter in parameters :
        model = LogitRegression( learning_rate = rate,
                                iterations = n_iter )
        model.fit( X_train, Y_train )
        # Prediction on validation set
        Y_pred = model.predict( X_valid )
        # Accuracy = matches / number of validation samples. Y_valid is a
        # column vector, so flatten it before the element-wise comparison.
        # (Dividing by the last loop index, as before, under-counts the
        # sample total by one and inflates the accuracy.)
        curr_accuracy = float( np.mean( Y_valid.ravel() == Y_pred ) ) * 100
        if max_accuracy < curr_accuracy :
            max_accuracy = curr_accuracy
    print( "Maximum accuracy achieved by our model through grid searching : ", max_accuracy )
# Run the demo only when this file is executed as a script.
if __name__ == "__main__" :
    main()

# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Driver Code
def main() :
    """Grid-search ``C`` for sklearn's LogisticRegression and print accuracy.

    Reads ``diabetes.csv`` (last column is the label, the rest are features),
    holds out one third of the rows as a test set, lets ``GridSearchCV`` pick
    ``C`` on the training part and prints the test accuracy in percent.
    """
    # Imported locally so this snippet is runnable on its own.
    from sklearn.model_selection import train_test_split
    # Importing dataset
    df = pd.read_csv( "diabetes.csv" )
    X = df.iloc[:,:-1].values
    Y = df.iloc[:,-1:].values
    # Splitting dataset into train and test set
    X_train, X_test, Y_train, Y_test = train_test_split(
      X, Y, test_size = 1/3, random_state = 0 )
    # grid searching over C, the inverse regularization strength
    # (sklearn's LogisticRegression exposes no learning-rate parameter)
    parameters = { 'C' : [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] }
    model = LogisticRegression()
    grid = GridSearchCV( model, parameters )
    # sklearn expects 1-D labels; Y_train is a column vector
    grid.fit( X_train, Y_train.ravel() )
    # Prediction on test set
    Y_pred = grid.predict( X_test )
    # Accuracy = matches / number of test samples (dividing by the last
    # loop index, as before, under-counts the sample total by one)
    accuracy = float( np.mean( Y_test.ravel() == Y_pred ) ) * 100
    print( "Maximum accuracy achieved by sklearn model through grid searching : ", np.round( accuracy, 2 ) )
# Run the demo only when this file is executed as a script.
if __name__ == "__main__" :
    main()


Available combinations :  [(0.1, 100), (0.1, 200), (0.1, 300), (0.1, 400), 
(0.1, 500), (0.2, 100), (0.2, 200), (0.2, 300), (0.2, 400), (0.2, 500), 
(0.3, 100), (0.3, 200), (0.3, 300), (0.3, 400), (0.3, 500), (0.4, 100), 
(0.4, 200), (0.4, 300), (0.4, 400), (0.4, 500), (0.5, 100), (0.5, 200), 
(0.5, 300), (0.5, 400), (0.5, 500), (0.01, 100), (0.01, 200), (0.01, 300),
(0.01, 400), (0.01, 500), (0.02, 100), (0.02, 200), (0.02, 300), (0.02, 400), 
(0.02, 500), (0.03, 100), (0.03, 200), (0.03, 300), (0.03, 400), (0.03, 500), 
(0.04, 100), (0.04, 200), (0.04, 300), (0.04, 400), (0.04, 500), (0.05, 100), 
(0.05, 200), (0.05, 300), (0.05, 400), (0.05, 500)]

Maximum accuracy achieved by our model through grid searching :  60.0


代码:在sklearn的Logistic Regression上实现Grid Search


# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Driver Code
def main() :
    """Grid-search ``C`` for sklearn's LogisticRegression and print accuracy.

    Reads ``diabetes.csv`` (last column is the label, the rest are features),
    holds out one third of the rows as a test set, lets ``GridSearchCV`` pick
    ``C`` on the training part and prints the test accuracy in percent.
    """
    # Imported locally so this snippet is runnable on its own.
    from sklearn.model_selection import train_test_split
    # Importing dataset
    df = pd.read_csv( "diabetes.csv" )
    X = df.iloc[:,:-1].values
    Y = df.iloc[:,-1:].values
    # Splitting dataset into train and test set
    X_train, X_test, Y_train, Y_test = train_test_split(
      X, Y, test_size = 1/3, random_state = 0 )
    # grid searching over C, the inverse regularization strength
    # (sklearn's LogisticRegression exposes no learning-rate parameter)
    parameters = { 'C' : [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] }
    model = LogisticRegression()
    grid = GridSearchCV( model, parameters )
    # sklearn expects 1-D labels; Y_train is a column vector
    grid.fit( X_train, Y_train.ravel() )
    # Prediction on test set
    Y_pred = grid.predict( X_test )
    # Accuracy = matches / number of test samples (dividing by the last
    # loop index, as before, under-counts the sample total by one)
    accuracy = float( np.mean( Y_test.ravel() == Y_pred ) ) * 100
    print( "Maximum accuracy achieved by sklearn model through grid searching : ", np.round( accuracy, 2 ) )
# Run the demo only when this file is executed as a script.
if __name__ == "__main__" :
    main()


Maximum accuracy achieved by sklearn model through grid searching :  62.86
