📜  用于机器学习的 7 个最佳 R 包

📅  最后修改于: 2021-10-22 02:55:52             🧑  作者: Mango

机器学习是人工智能的一个子集,专注于开发计算机软件或程序,这些软件或程序可以访问数据以进行自我学习并进行预测,即无需明确编程。机器学习由不同的子部分组成,即无监督学习、监督学习和强化学习。它定义了有助于构建模型的数值问题和分类问题。

7-Best-R-Packages-for-Machine-Learning-1

R语言用于统计分析和计算,被全世界大多数研究人员使用。 R 正被用于构建机器学习模型,因为它具有灵活性、高效的软件包以及执行与云集成的深度学习模型的能力。作为一种开源语言,所有包都发布在 R 上,并得到了世界各地程序员的贡献,以使其更加用户友好。以下是在工业界广泛使用的 R 包:

  • data.table
  • dplyr
  • ggplot2
  • caret
  • e1071
  • xgboost
  • randomForest

data.table

data.table 提供了 R 的 data.frame 的高性能版本,具有功能增强和语法,易于使用、内存高效且功能丰富。它提供了一个快速友好的分隔文件阅读器和文件编写器。它是 Github 上评价最高的软件包之一。它提供低级并行性、具有功能丰富的连接的可扩展聚合以及功能丰富的重塑数据。

R
# Install the data.table package (fast, memory-efficient data frames)
install.packages("data.table")

# Attach the package
library(data.table)

# Read the dataset with data.table's fast delimited-file reader
mortgage_dt <- fread("Government_Mortage.csv")

# Filter: rows whose loan amount is 114
# and whose county code is 60
matching_records <- mortgage_dt[loan_amount == 114 & county_code == 60]
matching_records


R
# Installing the package
install.packages("dplyr")
  
# Loading package
library(dplyr)
  
# Importing dataset
Gov_mortage <- fread("Government_Mortage.csv")
  
# Select
select(Gov_mortage, state_code)
  
# Mutate
m <- mutate(Gov_mortage, 
            amount = loan_amount - applicant_income)
m
  
# Filter
f = filter(Gov_mortage, county_code == 80)
f
  
# Arrange
arrange(Gov_mortage, county_code == 80)
  
# Summarize
summarise(f, max_loan = max(loan_amount))


R
# Installing the package 
install.packages("dplyr") 
install.packages("ggplot2")
    
# Loading packages 
library(dplyr) 
library(ggplot2)
  
# Data Layer 
ggplot(data = mtcars) 
     
# Aesthetic Layer 
ggplot(data = mtcars, aes(x = hp, 
                          y = mpg,
                          col = disp)) 
     
# Geometric layer 
ggplot(data = mtcars,  
       aes(x = hp, y = mpg, 
           col = disp)) + geom_point()


R
# Installing Packages 
install.packages("e1071") 
install.packages("caTools") 
install.packages("caret") 
    
# Loading package 
library(e1071) 
library(caTools) 
library(caret) 
    
# Loading data 
data(iris) 
  
# Splitting data into train 
# and test data 
split <- sample.split(iris, SplitRatio = 0.7) 
train_cl <- subset(iris, split == "TRUE") 
test_cl <- subset(iris, split == "FALSE") 
    
# Feature Scaling 
train_scale <- scale(train_cl[, 1:4]) 
test_scale <- scale(test_cl[, 1:4]) 
    
# Fitting Naive Bayes Model  
# to training dataset 
set.seed(120)  # Setting Seed 
classifier_cl <- naiveBayes(Species ~ ., 
                            data = train_cl) 
classifier_cl 
    
# Predicting on test data' 
y_pred <- predict(classifier_cl,
                  newdata = test_cl) 
    
# Confusion Matrix 
cm <- table(test_cl$Species, y_pred) 
cm


R
# Installing Packages 
install.packages("e1071") 
install.packages("caTools") 
install.packages("class") 
    
# Loading package 
library(e1071) 
library(caTools) 
library(class) 
    
# Loading data 
data(iris) 
    
# Splitting data into train 
# and test data 
split <- sample.split(iris, SplitRatio = 0.7) 
train_cl <- subset(iris, split == "TRUE") 
test_cl <- subset(iris, split == "FALSE") 
    
# Feature Scaling 
train_scale <- scale(train_cl[, 1:4]) 
test_scale <- scale(test_cl[, 1:4]) 
    
# Fitting KNN Model  
# to training dataset 
classifier_knn <- knn(train = train_scale, 
                      test = test_scale, 
                      cl = train_cl$Species, 
                      k = 1) 
classifier_knn 
    
# Confusiin Matrix 
cm <- table(test_cl$Species, classifier_knn) 
cm 
    
# Model Evaluation - Choosing K 
# Calculate out of Sample error 
misClassError <- mean(classifier_knn != test_cl$Species) 
print(paste('Accuracy =', 1 - misClassError))


R
# Installing Packages 
install.packages("data.table") 
install.packages("dplyr") 
install.packages("ggplot2") 
install.packages("caret") 
install.packages("xgboost") 
install.packages("e1071") 
install.packages("cowplot") 
    
# Loading packages 
library(data.table) # for reading and manipulation of data 
library(dplyr)      # for data manipulation and joining 
library(ggplot2)    # for ploting  
library(caret)      # for modeling 
library(xgboost)    # for building XGBoost model 
library(e1071)      # for skewness 
library(cowplot)    # for combining multiple plots  
    
# Setting test dataset 
# Combining datasets 
# add Item_Outlet_Sales to test data 
test[, Item_Outlet_Sales := NA]  
combi = rbind(train, test) 
    
# Missing Value Treatment 
missing_index = which(is.na(combi$Item_Weight)) 
for(i in missing_index){ 
  item = combi$Item_Identifier[i] 
  combi$Item_Weight[i] = mean(combi$Item_Weight 
                         [combi$Item_Identifier == item],  
                         na.rm = T) 
} 
    
# Replacing 0 in Item_Visibility with mean 
zero_index = which(combi$Item_Visibility == 0) 
for(i in zero_index){ 
  item = combi$Item_Identifier[i] 
  combi$Item_Visibility[i] = mean( 
       combi$Item_Visibility[combi$Item_Identifier == item],  
       na.rm = T) 
} 
    
# Label Encoding 
# To convert categorical in numerical 
combi[, Outlet_Size_num :=  
           ifelse(Outlet_Size == "Small", 0, 
           ifelse(Outlet_Size == "Medium", 1, 2))] 
    
combi[, Outlet_Location_Type_num :=  
           ifelse(Outlet_Location_Type == "Tier 3", 0, 
           ifelse(Outlet_Location_Type == "Tier 2", 1, 2))] 
    
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL] 
    
# One Hot Encoding 
# To convert categorical in numerical 
ohe_1 = dummyVars("~.", 
        data = combi[, -c("Item_Identifier",  
                    "Outlet_Establishment_Year", 
                    "Item_Type")], fullRank = T) 
ohe_df = data.table(predict(ohe_1,  
         combi[, -c("Item_Identifier",  
         "Outlet_Establishment_Year", "Item_Type")])) 
    
combi = cbind(combi[, "Item_Identifier"], ohe_df) 
    
# Remove skewness 
skewness(combi$Item_Visibility)  
skewness(combi$price_per_unit_wt) 
    
# log + 1 to avoid division by zero 
combi[, Item_Visibility := log(Item_Visibility + 1)]  
    
# Scaling and Centering data 
# index of numeric features 
num_vars = which(sapply(combi, is.numeric))  
num_vars_names = names(num_vars) 
    
combi_numeric = combi[, setdiff(num_vars_names,  
                "Item_Outlet_Sales"), with = F] 
    
prep_num = preProcess(combi_numeric, 
                 method = c("center", "scale")) 
combi_numeric_norm = predict(prep_num, combi_numeric) 
    
# removing numeric independent variables 
combi[, setdiff(num_vars_names,  
               "Item_Outlet_Sales") := NULL]  
combi = cbind(combi,  
              combi_numeric_norm) 
    
# Splitting data back to train and test 
train = combi[1:nrow(train)] 
test = combi[(nrow(train) + 1):nrow(combi)] 
    
# Removing Item_Outlet_Sales 
test[, Item_Outlet_Sales := NULL]  
    
# Model Building: XGBoost 
param_list = list( 
  objective = "reg:linear", 
  eta = 0.01, 
  gamma = 1, 
  max_depth = 6, 
  subsample = 0.8, 
  colsample_bytree = 0.5 
) 
    
# Converting train and test into xgb.DMatrix format 
Dtrain = xgb.DMatrix( 
         data = as.matrix(train[, -c("Item_Identifier",  
                                "Item_Outlet_Sales")]),  
         label = train$Item_Outlet_Sales) 
Dtest = xgb.DMatrix( 
         data = as.matrix(test[, -c("Item_Identifier")])) 
    
# 5-fold cross-validation to  
# find optimal value of nrounds 
set.seed(112)  # Setting seed 
xgbcv = xgb.cv(params = param_list,  
               data = Dtrain,  
               nrounds = 1000,  
               nfold = 5,  
               print_every_n = 10,  
               early_stopping_rounds = 30,  
               maximize = F) 
    
# Training XGBoost model at nrounds = 428 
xgb_model = xgb.train(data = Dtrain,  
                      params = param_list,  
                      nrounds = 428) 
xgb_model


R
# Installing package 
  
# For sampling the dataset 
install.packages("caTools")
  
# For implementing random forest algorithm 
install.packages("randomForest")  
    
# Loading package 
library(caTools) 
library(randomForest) 
    
# Loading data 
data(iris) 
  
# Splitting data in train and test data 
split <- sample.split(iris, SplitRatio = 0.7) 
split 
    
train <- subset(iris, split == "TRUE") 
test <- subset(iris, split == "FALSE") 
    
# Fitting Random Forest to the train dataset 
set.seed(120)  # Setting seed 
classifier_RF = randomForest(x = train[-5], 
                             y = train$Species, 
                             ntree = 500) 
    
classifier_RF 
    
# Predicting the Test set results 
y_pred = predict(classifier_RF, newdata = test[-5]) 
    
# Confusion Matrix 
confusion_mtx = table(test[, 5], y_pred) 
confusion_mtx


输出:

row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md
1:     65         1             1            3         1         114           3    344
2:   1285         1             1            1         1         114           2     -1
3:   6748         1             1            3         1         114           3    309
4:  31396         1             1            1         1         114           1    333
5:  70311         1             1            1         1         114           3    309
6: 215535         1             1            3         1         114           3    365
7: 217264         1             1            1         2         114           3    333
8: 301947         1             1            3         1         114           3     48
  state_code county_code applicant_ethnicity applicant_race applicant_sex
1:          9          60                   2              5             1
2:         25          60                   2              5             2
3:         47          60                   2              5             2
4:          6          60                   2              5             1
5:         47          60                   2              5             2
6:         21          60                   2              5             1
7:          6          60                   2              5             1
8:         14          60                   2              5             2
  applicant_income population minority_population_pct ffiecmedian_family_income
1:               68       6355                  14.844                     61840
2:               57       1538                   2.734                     58558
3:               54       4084                   5.329                     76241
4:              116       5445                  41.429                     70519
5:               50       5214                   3.141                     74094
6:               57       6951                   4.219                     56341
7:               37       2416                  18.382                     70031
8:               35       3159                   8.533                     56335
  tract_to_msa_md_income_pct number_of_owner-occupied_units
1:                    100.000                           1493
2:                    100.000                            561
3:                    100.000                           1359
4:                    100.000                           1522
5:                    100.000                           1694
6:                     97.845                           2134
7:                     65.868                            905
8:                    100.000                           1080
  number_of_1_to_4_family_units lender co_applicant
1:                          2202   3507         TRUE
2:                           694   6325        FALSE
3:                          1561   3549        FALSE
4:                          1730   2111         TRUE
5:                          2153   3229        FALSE
6:                          2993   1574         TRUE
7:                          3308   3110        FALSE
8:                          1492   6314        FALSE

Dplyr

dplyr 是业界广泛使用的数据操作包。它由五个关键的数据操作函数(也称为动词)组成,即 select(选择)、filter(过滤)、arrange(排列)、mutate(变异)和 summarise(汇总)。

R

# Install the required packages
install.packages("dplyr")
install.packages("data.table")  # fread() lives in data.table, not dplyr

# Load packages
library(dplyr)
library(data.table)  # required for fread() below

# Import dataset
Gov_mortage <- fread("Government_Mortage.csv")

# Select: keep only the state_code column
select(Gov_mortage, state_code)

# Mutate: derive a new column from existing ones
m <- mutate(Gov_mortage,
            amount = loan_amount - applicant_income)
m

# Filter: keep only rows for county 80
f <- filter(Gov_mortage, county_code == 80)
f

# Arrange: sort rows by county code
# (the original sorted by the logical `county_code == 80`,
# which only splits rows into FALSE/TRUE groups)
arrange(Gov_mortage, county_code)

# Summarize: largest loan amount within county 80
summarise(f, max_loan = max(loan_amount))

输出:

# Filter
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md
1      16         1             1            3         2         177           3    333
2      25         1             1            3         1         292           3    333
3     585         1             1            3         1         134           3    333
4    1033         1             1            1         1         349           2    333
5    1120         1             1            1         1         109           3    333
6    1303         1             1            3         2         166           3    333
7    1758         1             1            2         1          45           3    333
8    2053         3             1            1         1         267           3    333
9    3305         1             1            3         1         392           3    333
10   3555         1             1            3         1          98           3    333
11   3769         1             1            3         1         288           3    333
12   3807         1             1            1         1         185           3    333
13   3840         1             1            3         1         280           3    333
14   5356         1             1            3         1         123           3    333
15   5604         2             1            1         1         191           1    333
16   6294         1             1            2         1         102           3    333
17   7631         3             1            3         1         107           3    333
18   8222         2             1            3         1          62           3    333
19   9335         1             1            3         1         113           3    333
20  10137         1             1            1         1         204           3    333
21  10387         3             1            1         1         434           2    333
22  10601         2             1            1         1         299           2    333
23  13076         1             1            1         1         586           3    333
24  13763         1             1            3         1          29           3    333
25  13769         3             1            1         1         262           3    333
26  13818         2             1            1         1         233           3    333
27  14102         1             1            3         1         130           3    333
28  14196         1             1            2         1           3           3    333
29  15569         1             1            1         1         536           2    333
30  15863         1             1            1         1          20           3    333
31  16184         1             1            3         1         755           3    333
32  16296         1             1            1         2         123           2    333
33  16328         1             1            3         1         153           3    333
34  16486         3             1            3         1          95           3    333
35  16684         1             1            2         1          26           3    333
36  16922         1             1            1         1         160           2    333
37  17470         1             1            3         1         174           3    333
38  18336         1             2            1         1          37           3    333
39  18586         1             2            1         1         114           3    333
40  19249         1             1            3         1         422           3    333
41  19405         1             1            1         1         288           2    333
42  19421         1             1            2         1         301           3    333
43  20125         1             1            3         1         449           3    333
44  20388         1             1            1         1         494           3    333
45  21434         1             1            3         1         251           3    333
  state_code county_code applicant_ethnicity applicant_race applicant_sex
1           6          80                   1              5             2
2           6          80                   2              3             2
3           6          80                   2              5             2
4           6          80                   3              6             1
5           6          80                   2              5             2
6           6          80                   1              5             1
7           6          80                   2              5             1
8           6          80                   1              5             2
9           6          80                   2              5             1
10          6          80                   2              5             1
11          6          80                   2              5             1
12          6          80                   1              5             1
13          6          80                   1              5             1
14          6          80                   1              5             1
15          6          80                   2              5             1
16          6          80                   2              5             1
17          6          80                   3              6             3
18          6          80                   2              5             1
19          6          80                   2              5             2
20          6          80                   2              5             1
21          6          80                   2              5             1
22          6          80                   2              5             1
23          6          80                   3              6             3
24          6          80                   2              5             2
25          6          80                   2              5             1
26          6          80                   2              5             1
27          6          80                   2              5             2
28          6          80                   1              6             1
29          6          80                   2              5             1
30          6          80                   2              5             1
31          6          80                   2              5             1
32          6          80                   2              5             2
33          6          80                   1              5             1
34          6          80                   2              5             1
35          6          80                   2              5             1
36          6          80                   2              5             2
37          6          80                   2              5             1
38          6          80                   1              5             1
39          6          80                   1              5             1
40          6          80                   1              5             1
41          6          80                   2              2             1
42          6          80                   2              5             1
43          6          80                   2              5             1
44          6          80                   2              5             1
45          6          80                   2              5             1
  applicant_income population minority_population_pct ffiecmedian_family_income
1                NA       6420                  29.818                     68065
2                99       4346                  16.489                     70745
3                46       6782                  20.265                     69818
4               236       9813                  15.168                     69691
5                49       5854                  35.968                     70555
6               148       4234                  19.864                     72156
7               231       5699                  17.130                     71892
8                48       6537                  13.024                     71562
9               219      18911                  26.595                     69795
10               71       8454                  17.436                     68727
11               94       6304                  13.490                     69181
12               78       9451                  14.684                     69337
13               74      15540                  43.148                     70000
14               54      16183                  42.388                     70862
15               73      11198                  40.481                     70039
16              199      12133                  10.971                     70023
17               43      10712                  33.973                     68117
18              115       8759                  17.669                     70526
19               59      24887                  32.833                     71510
20              135      25252                  31.854                     69602
21              108       6507                  13.613                     70267
22              191       9261                  22.583                     71505
23              430       7943                  19.990                     70801
24              206       7193                  18.002                     69973
25              150       7413                  14.092                     68202
26               94       7611                  14.618                     71260
27               81      10946                  34.220                     70386
28               64      10438                  36.395                     70141
29              387       8258                  20.666                     69409
30               80       7525                  26.604                     70104
31               NA       4525                  20.299                     71947
32               40       8397                  32.542                     68087
33               87      20083                  19.750                     69893
34               96      20539                  19.673                     72152
35               45      10497                  12.920                     70134
36               54      15686                  26.071                     70890
37              119       7558                  14.710                     69052
38               62      25960                  32.858                     68061
39               18       5790                  39.450                     68878
40              103      18086                  26.099                     69925
41               70       8689                  31.467                     70794
42               38       3923                  30.206                     68821
43              183       6522                  13.795                     69779
44              169      18459                  26.874                     69392
45              140      15954                  25.330                     71096
  tract_to_msa_md_income_pct number_of_owner-occupied_units
1                     100.000                           1553
2                     100.000                           1198
3                     100.000                           1910
4                     100.000                           2351
5                     100.000                           1463
6                     100.000                           1276
7                     100.000                           1467
8                     100.000                           1766
9                     100.000                           4316
10                     90.071                           2324
11                    100.000                           1784
12                    100.000                           2357
13                    100.000                           3252
14                    100.000                           3319
15                     79.049                           2438
16                    100.000                           3639
17                    100.000                           2612
18                     87.201                           2345
19                    100.000                           6713
20                    100.000                           6987
21                    100.000                           1788
22                     91.023                           2349
23                    100.000                           1997
24                    100.000                           2012
25                    100.000                           2359
26                    100.000                           2304
27                    100.000                           2674
28                     80.957                           2023
29                    100.000                           2034
30                    100.000                           2343
31                     77.707                           1059
32                    100.000                           1546
33                    100.000                           5929
34                    100.000                           6017
35                    100.000                           3542
36                    100.000                           4277
37                    100.000                           2316
38                    100.000                           6989
39                     56.933                           1021
40                    100.000                           4183
41                    100.000                           1540
42                    100.000                            882
43                    100.000                           1774
44                    100.000                           4417
45                    100.000                           4169
  number_of_1_to_4_family_units lender co_applicant
1                           2001   3354        FALSE
2                           1349   2458        FALSE
3                           2326   4129        FALSE
4                           2928   4701         TRUE
5                           1914   2134        FALSE
6                           1638   5710        FALSE
7                           1670   3110        FALSE
8                           1926   3080         TRUE
9                           5241   5710         TRUE
10                          3121   5710        FALSE
11                          1953    933        FALSE
12                          2989    186         TRUE
13                          4482   2134         TRUE
14                          4380   5339         TRUE
15                          3495   5267         TRUE
16                          4875   1831         TRUE
17                          3220   5710        FALSE
18                          3024   3885         TRUE
19                          7980   2458        FALSE
20                          7949   6240         TRUE
21                          2015    542         TRUE
22                          3215   2702         TRUE
23                          2361   3216        FALSE
24                          2482   6240        FALSE
25                          2597   3970         TRUE
26                          2503   3264        FALSE
27                          3226   2570         TRUE
28                          3044   6240        FALSE
29                          2423   1928         TRUE
30                          2659   5738        FALSE
31                          1544   2458        FALSE
32                          2316   3950        FALSE
33                          7105   3143        FALSE
34                          7191   4701        FALSE
35                          4325   5339        FALSE
36                          5188   2702        FALSE
37                          2531   2458         TRUE
38                          7976   2318         TRUE
39                          1755   5026        FALSE
40                          5159   4931         TRUE
41                          2337   2352        FALSE
42                          1317   2458        FALSE
43                          1949   5726        FALSE
44                          5055   5316         TRUE
45                          5197   5726        FALSE
[ reached 'max' / getOption("max.print") -- omitted 1034 rows ]
# 

ggplot2

ggplot2 也称为 Grammar of Graphics(图形语法),是一个免费、开源且易于使用的可视化包,广泛应用于 R 中。它是由 Hadley Wickham 编写的最强大的可视化包之一。

R

# Install the plotting packages
install.packages("dplyr")
install.packages("ggplot2")

# Attach packages
library(dplyr)
library(ggplot2)

# Data layer: an empty plot bound to the mtcars data
ggplot(mtcars)

# Aesthetic layer: map horsepower, mileage and displacement
ggplot(mtcars, aes(hp, mpg, col = disp))

# Geometric layer: render the mapping as a scatter plot
ggplot(mtcars, aes(hp, mpg, col = disp)) +
  geom_point()

输出:

  • 几何层:

输出

  • 几何层 – 添加大小:

caret

caret(Classification And REgression Training,分类与回归训练)包提供了许多用于训练和绘制分类与回归模型的函数。它是 R 开发人员和各种机器学习竞赛中使用最广泛的软件包之一。

R

# Install the required packages
install.packages("e1071")
install.packages("caTools")
install.packages("caret")

# Load packages
library(e1071)
library(caTools)
library(caret)

# Load the iris data set
data(iris)

# Split into train and test sets.
# sample.split() expects the outcome vector, not the whole
# data frame, so the class ratio is preserved in both parts.
# The seed is set BEFORE the random split so the split itself
# is reproducible.
set.seed(120)
split <- sample.split(iris$Species, SplitRatio = 0.7)
train_cl <- subset(iris, split == TRUE)
test_cl <- subset(iris, split == FALSE)

# Feature scaling of the four numeric columns.
# NOTE(review): naiveBayes() below is fit on the unscaled data;
# these scaled copies are kept for inspection only.
train_scale <- scale(train_cl[, 1:4])
test_scale <- scale(test_cl[, 1:4])

# Fit a Naive Bayes model on the training data
classifier_cl <- naiveBayes(Species ~ .,
                            data = train_cl)
classifier_cl

# Predict the species on the test data
y_pred <- predict(classifier_cl,
                  newdata = test_cl)

# Confusion matrix: actual vs predicted species
cm <- table(test_cl$Species, y_pred)
cm

输出:

  • 模型分类器_cl:

  • 混淆矩阵:

输出

e1071

e1071 包用于执行聚类算法、支持向量机(SVM)、最短路径计算、袋装聚类算法、K-NN 算法等,其中最主要的用途是执行 K-NN 算法。K-NN 的效果取决于 k 值(邻居数),它在金融、医疗保健等许多行业都有应用。K-最近邻(K-NN)是一种监督式非线性分类算法,同时也是一种非参数算法,即它不对基础数据或其分布做出任何假设。

R

# Install the required packages
install.packages("e1071")
install.packages("caTools")
install.packages("class")

# Load packages
library(e1071)
library(caTools)
library(class)

# Load the iris data set
data(iris)

# Split into train and test sets.
# sample.split() expects the outcome vector, not the whole
# data frame, so the class ratio is preserved in both parts.
split <- sample.split(iris$Species, SplitRatio = 0.7)
train_cl <- subset(iris, split == TRUE)
test_cl <- subset(iris, split == FALSE)

# Feature scaling: KNN is distance-based, so the four numeric
# columns must be on a comparable scale.
# NOTE(review): the test set is scaled with its own mean/sd;
# strictly, it should reuse the training set's center and scale.
train_scale <- scale(train_cl[, 1:4])
test_scale <- scale(test_cl[, 1:4])

# Fit a KNN model (k = 1) on the training data
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train_cl$Species,
                      k = 1)
classifier_knn

# Confusion matrix: actual vs predicted species
cm <- table(test_cl$Species, classifier_knn)
cm

# Model evaluation: out-of-sample error for the chosen k
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1 - misClassError))

输出:

  • 模型分类器_knn(k=1):

  • 混淆矩阵:

输出输出

  • 模型评估(k=1):

输出

XGBoost

XGBoost 仅适用于数值变量。它是提升(Boosting)技术的一部分,通过更智能地选择样本来对观测进行分类。XGBoost 在 C++、R、Python、Julia、Java 和 Scala 中均有接口,并结合了 Bagging 和 Boosting 技术。本示例使用 BigMart 数据集。

R

# XGBoost regression on the BigMart sales dataset.
# Pipeline: combine train/test -> impute missing values -> encode
# categoricals -> de-skew -> center/scale -> re-split -> 5-fold CV ->
# train final model.

# Installing Packages 
install.packages("data.table") 
install.packages("dplyr") 
install.packages("ggplot2") 
install.packages("caret") 
install.packages("xgboost") 
install.packages("e1071") 
install.packages("cowplot") 
    
# Loading packages 
library(data.table) # for reading and manipulation of data 
library(dplyr)      # for data manipulation and joining 
library(ggplot2)    # for ploting  
library(caret)      # for modeling 
library(xgboost)    # for building XGBoost model 
library(e1071)      # for skewness 
library(cowplot)    # for combining multiple plots  
    
# Setting test dataset 
# Combining datasets 
# add Item_Outlet_Sales to test data 
# NOTE(review): `train` and `test` (BigMart data.tables) are assumed to be
# loaded before this point (e.g. via fread) — not shown in this excerpt.
# The target column is added to `test` as NA so both tables share a schema
# and can be row-bound for joint preprocessing.
test[, Item_Outlet_Sales := NA]  
combi = rbind(train, test) 
    
# Missing Value Treatment 
# Impute each missing Item_Weight with the mean weight of rows sharing
# the same Item_Identifier (per-product mean imputation).
missing_index = which(is.na(combi$Item_Weight)) 
for(i in missing_index){ 
  item = combi$Item_Identifier[i] 
  combi$Item_Weight[i] = mean(combi$Item_Weight 
                         [combi$Item_Identifier == item],  
                         na.rm = T) 
} 
    
# Replacing 0 in Item_Visibility with mean 
# (zero visibility is treated as an invalid reading, replaced with the
# per-product mean visibility)
zero_index = which(combi$Item_Visibility == 0) 
for(i in zero_index){ 
  item = combi$Item_Identifier[i] 
  combi$Item_Visibility[i] = mean( 
       combi$Item_Visibility[combi$Item_Identifier == item],  
       na.rm = T) 
} 
    
# Label Encoding 
# To convert categorical in numerical 
# (ordinal encoding: Small < Medium < High, Tier 3 < Tier 2 < Tier 1)
combi[, Outlet_Size_num :=  
           ifelse(Outlet_Size == "Small", 0, 
           ifelse(Outlet_Size == "Medium", 1, 2))] 
    
combi[, Outlet_Location_Type_num :=  
           ifelse(Outlet_Location_Type == "Tier 3", 0, 
           ifelse(Outlet_Location_Type == "Tier 2", 1, 2))] 
    
# Drop the original categorical columns now that numeric versions exist.
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL] 
    
# One Hot Encoding 
# To convert categorical in numerical 
# (caret::dummyVars; ID, year and item-type columns are excluded from
# the dummy expansion and re-attached / dropped below)
ohe_1 = dummyVars("~.", 
        data = combi[, -c("Item_Identifier",  
                    "Outlet_Establishment_Year", 
                    "Item_Type")], fullRank = T) 
ohe_df = data.table(predict(ohe_1,  
         combi[, -c("Item_Identifier",  
         "Outlet_Establishment_Year", "Item_Type")])) 
    
combi = cbind(combi[, "Item_Identifier"], ohe_df) 
    
# Remove skewness 
# NOTE(review): `price_per_unit_wt` is not created anywhere in this
# excerpt — presumably a feature engineered in an earlier step; confirm,
# otherwise this line errors on a missing column.
skewness(combi$Item_Visibility)  
skewness(combi$price_per_unit_wt) 
    
# log(x + 1) transform to reduce right skew while keeping x = 0 valid
# (plain log(0) would be -Inf)
combi[, Item_Visibility := log(Item_Visibility + 1)]  
    
# Scaling and Centering data 
# index of numeric features 
num_vars = which(sapply(combi, is.numeric))  
num_vars_names = names(num_vars) 
    
# All numeric predictors except the target column.
combi_numeric = combi[, setdiff(num_vars_names,  
                "Item_Outlet_Sales"), with = F] 
    
prep_num = preProcess(combi_numeric, 
                 method = c("center", "scale")) 
combi_numeric_norm = predict(prep_num, combi_numeric) 
    
# removing numeric independent variables 
# (replace raw numeric columns with their centered/scaled versions)
combi[, setdiff(num_vars_names,  
               "Item_Outlet_Sales") := NULL]  
combi = cbind(combi,  
              combi_numeric_norm) 
    
# Splitting data back to train and test 
# (row order was preserved by rbind, so positional slicing is valid)
train = combi[1:nrow(train)] 
test = combi[(nrow(train) + 1):nrow(combi)] 
    
# Removing Item_Outlet_Sales (it is all-NA in the test rows by construction)
test[, Item_Outlet_Sales := NULL]  
    
# Model Building: XGBoost 
# NOTE(review): objective "reg:linear" is deprecated in modern xgboost
# releases in favour of "reg:squarederror" (same loss, new name).
param_list = list( 
  objective = "reg:linear", 
  eta = 0.01, 
  gamma = 1, 
  max_depth = 6, 
  subsample = 0.8, 
  colsample_bytree = 0.5 
) 
    
# Converting train and test into xgb.DMatrix format 
Dtrain = xgb.DMatrix( 
         data = as.matrix(train[, -c("Item_Identifier",  
                                "Item_Outlet_Sales")]),  
         label = train$Item_Outlet_Sales) 
Dtest = xgb.DMatrix( 
         data = as.matrix(test[, -c("Item_Identifier")])) 
    
# 5-fold cross-validation to  
# find optimal value of nrounds 
set.seed(112)  # Setting seed 
xgbcv = xgb.cv(params = param_list,  
               data = Dtrain,  
               nrounds = 1000,  
               nfold = 5,  
               print_every_n = 10,  
               early_stopping_rounds = 30,  
               maximize = F) 
    
# Training XGBoost model at nrounds = 428 
# NOTE(review): 428 is presumably the best iteration reported by the CV
# run above for the original data/seed — re-derive from xgbcv rather than
# hard-coding if the data changes.
xgb_model = xgb.train(data = Dtrain,  
                      params = param_list,  
                      nrounds = 428) 
xgb_model

输出:

  • Xgboost模型的训练:

输出

  • 型号 xgb_model:

输出

随机森林

R 编程中的随机森林是决策树的集合。它构建并组合多个决策树以获得更准确的预测,是一种非线性分类算法。利用构建每棵树时未被使用的样本(袋外样本)可以得到模型的误差估计,这称为袋外误差估计,通常以百分比形式表示。

R

# Random Forest classification on the iris dataset.
# Pipeline: install/load packages -> stratified train/test split ->
# fit 500-tree forest -> predict -> confusion matrix.

# Installing package 
  
# For sampling the dataset 
install.packages("caTools")
  
# For implementing random forest algorithm 
install.packages("randomForest")  
    
# Loading package 
library(caTools) 
library(randomForest) 
    
# Loading data 
data(iris) 
  
# Splitting data in train and test data 
# FIX: sample.split() expects the label VECTOR, not the whole data frame.
# Passing `iris` produced a length-5 logical recycled across the 150 rows;
# using the Species column gives one TRUE/FALSE per row, stratified by class.
set.seed(123)  # seed the split so the partition is reproducible
split <- sample.split(iris$Species, SplitRatio = 0.7) 
split 
    
train <- subset(iris, split == TRUE) 
test <- subset(iris, split == FALSE) 
    
# Fitting Random Forest to the train dataset 
# x = the four numeric predictors (column 5 is Species), y = class labels
set.seed(120)  # Setting seed (for the forest's internal randomness)
classifier_RF = randomForest(x = train[-5], 
                             y = train$Species, 
                             ntree = 500) 
    
classifier_RF 
    
# Predicting the Test set results 
y_pred = predict(classifier_RF, newdata = test[-5]) 
    
# Confusion Matrix: rows = actual species, columns = predicted species 
confusion_mtx = table(test[, 5], y_pred) 
confusion_mtx

输出:

  • 模型分类器_RF:

输出

  • 混淆矩阵: