# Structured data mining (정형 데이터마이닝): BlackFriday

# Work from the folder that contains the data file.
setwd(dir = "c:/ADP/data")

# Load the Black Friday purchase records and inspect the column types.
BlackFriday <- read.csv(file = "BlackFriday.csv")
str(object = BlackFriday)

# 'data.frame':	537577 obs. of  12 variables:
#   $ User_ID                   : int  1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004 1000004 1000005 ...
# $ Product_ID                : chr  "P00069042" "P00248942" "P00087842" "P00085442" ...
# $ Gender                    : chr  "F" "F" "F" "F" ...
# $ Age                       : chr  "0-17" "0-17" "0-17" "0-17" ...
# $ Occupation                : int  10 10 10 10 16 15 7 7 7 20 ...
# $ City_Category             : chr  "A" "A" "A" "A" ...
# $ Stay_In_Current_City_Years: chr  "2" "2" "2" "2" ...
# $ Marital_Status            : int  0 0 0 0 0 0 1 1 1 1 ...
# $ Product_Category_1        : int  3 1 12 12 8 1 1 1 1 8 ...
# $ Product_Category_2        : int  NA 6 NA 14 NA 2 8 15 16 NA ...
# $ Product_Category_3        : int  NA 14 NA NA NA NA 17 NA NA NA ...
# $ Purchase                  : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...

# Count the missing values in every column.
# Product_Category_2 and Product_Category_3 are the columns that contain NA.
vapply(BlackFriday, function(column) sum(is.na(column)), integer(1))

# User_ID                 Product_ID                     Gender                        Age 
# 0                          0                          0                          0 
# Occupation              City_Category Stay_In_Current_City_Years             Marital_Status 
# 0                          0                          0                          0 
# Product_Category_1         Product_Category_2         Product_Category_3                   Purchase 
# 0                     166986                     373299                          0


# Missing-value imputation.
# Replace NA in the two optional product-category columns with 0. Indexed
# assignment touches only the NA positions; because 0 is a double, the columns
# end up numeric (double), exactly as with the former ifelse() call.
BlackFriday$Product_Category_2[is.na(BlackFriday$Product_Category_2)] <- 0
BlackFriday$Product_Category_3[is.na(BlackFriday$Product_Category_3)] <- 0


# summary() is used to confirm that no NA values remain.
summary(BlackFriday)

# User_ID         Product_ID           Gender              Age              Occupation     City_Category      Stay_In_Current_City_Years Marital_Status   Product_Category_1 Product_Category_2 Product_Category_3
# Min.   :1000001   Length:537577      Length:537577      Length:537577      Min.   : 0.000   Length:537577      Length:537577              Min.   :0.0000   Min.   : 1.000     Min.   : 0.000     Min.   : 0.000    
# 1st Qu.:1001495   Class :character   Class :character   Class :character   1st Qu.: 2.000   Class :character   Class :character           1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 0.000     1st Qu.: 0.000    
# Median :1003031   Mode  :character   Mode  :character   Mode  :character   Median : 7.000   Mode  :character   Mode  :character           Median :0.0000   Median : 5.000     Median : 5.000     Median : 0.000    
# Mean   :1002992                                                            Mean   : 8.083                                                 Mean   :0.4088   Mean   : 5.296     Mean   : 6.785     Mean   : 3.872    
# 3rd Qu.:1004417                                                            3rd Qu.:14.000                                                 3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:14.000     3rd Qu.: 8.000    
# Max.   :1006040                                                            Max.   :20.000                                                 Max.   :1.0000   Max.   :18.000     Max.   :18.000     Max.   :18.000    
# Purchase    
# Min.   :  185  
# 1st Qu.: 5866  
# Median : 8062  
# Mean   : 9334  
# 3rd Qu.:12073  
# Max.   :23961


# Add product_all: the row-wise sum of the three product-category codes.
BlackFriday$product_all <- with(
  BlackFriday,
  Product_Category_1 + Product_Category_2 + Product_Category_3
)

# Check the data type of the variables named in the task and convert each one
# to a suitable type: User_ID becomes character, the coded columns become
# factors.
BlackFriday[["User_ID"]] <- as.character(BlackFriday[["User_ID"]])
BlackFriday[["Occupation"]] <- as.factor(BlackFriday[["Occupation"]])
BlackFriday[["Marital_Status"]] <- as.factor(BlackFriday[["Marital_Status"]])
BlackFriday[["Product_Category_1"]] <- as.factor(BlackFriday[["Product_Category_1"]])
BlackFriday[["Product_Category_2"]] <- as.factor(BlackFriday[["Product_Category_2"]])
BlackFriday[["Product_Category_3"]] <- as.factor(BlackFriday[["Product_Category_3"]])

str(BlackFriday)

# 'data.frame':	537577 obs. of  13 variables:
# $ User_ID                   : chr  "1000001" "1000001" "1000001" "1000001" ...
# $ Product_ID                : chr  "P00069042" "P00248942" "P00087842" "P00085442" ...
# $ Gender                    : chr  "F" "F" "F" "F" ...
# $ Age                       : chr  "0-17" "0-17" "0-17" "0-17" ...
# $ Occupation                : Factor w/ 21 levels "0","1","2","3",..: 11 11 11 11 17 16 8 8 8 21 ...
# $ City_Category             : chr  "A" "A" "A" "A" ...
# $ Stay_In_Current_City_Years: chr  "2" "2" "2" "2" ...
# $ Marital_Status            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 2 2 ...
# $ Product_Category_1        : Factor w/ 18 levels "1","2","3","4",..: 3 1 12 12 8 1 1 1 1 8 ...
# $ Product_Category_2        : Factor w/ 18 levels "0","2","3","4",..: 1 6 1 14 1 2 8 15 16 1 ...
# $ Product_Category_3        : Factor w/ 16 levels "0","3","4","5",..: 1 12 1 1 1 1 15 1 1 1 ...
# $ Purchase                  : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
# $ product_all               : num  3 21 12 26 8 3 26 16 17 8 ...

# Dummy encoding of Gender, Age, City_Category and Stay_In_Current_City_Years:
# the variables are numerically encoded first, then caret's dummyVars() is used
# to build the dummy (one-hot) columns.

# Install caret / dplyr only when they are missing, so that re-running the
# script does not download and reinstall them every time. local() keeps the
# helper variables out of the global environment.
local({
  required <- c("caret", "dplyr")
  installed <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  if (any(!installed)) {
    install.packages(required[!installed])
  }
})
library(caret)
library(dplyr)


# mutate() adds new columns to a data frame, or overwrites existing ones.
# The new columns are kept only if the result is assigned to a variable
# (here BlackFriday_1); BlackFriday itself is left unchanged.
#
# Gender, Age, City_Category and Stay_In_Current_City_Years are character
# columns, so as.numeric() applied to them directly returns NA for every
# non-numeric label (e.g. "F", "0-17", "A", "4+"). Going through factor()
# gives each category its integer level code (1, 2, ...) instead.
BlackFriday_1 <- BlackFriday %>%
  mutate(
    Gender_binary = as.numeric(factor(Gender)),
    Age_binary = as.numeric(factor(Age)),
    City_Category_numeric = as.numeric(factor(City_Category)),
    Stay_In_Current_City_Years_numeric =
      as.numeric(factor(Stay_In_Current_City_Years))
  )

# Build the encoder for the four categorical columns from a formula string.
dummy <- dummyVars(
  "~ Gender + Age + City_Category + Stay_In_Current_City_Years",
  data = BlackFriday
)

# Apply the encoder and wrap the indicator matrix in a data frame
# (data.frame() also makes the column names syntactic, e.g. Age0.17),
# then append the indicator columns to the original data.
new_df <- dummy %>%
  predict(newdata = BlackFriday) %>%
  data.frame()
BlackFriday_2 <- cbind(BlackFriday, new_df)
str(BlackFriday_2)

# 'data.frame':	537577 obs. of  30 variables:
#   $ User_ID                     : chr  "1000001" "1000001" "1000001" "1000001" ...
# $ Product_ID                  : chr  "P00069042" "P00248942" "P00087842" "P00085442" ...
# $ Gender                      : chr  "F" "F" "F" "F" ...
# $ Age                         : chr  "0-17" "0-17" "0-17" "0-17" ...
# $ Occupation                  : Factor w/ 21 levels "0","1","2","3",..: 11 11 11 11 17 16 8 8 8 21 ...
# $ City_Category               : chr  "A" "A" "A" "A" ...
# $ Stay_In_Current_City_Years  : chr  "2" "2" "2" "2" ...
# $ Marital_Status              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 2 2 ...
# $ Product_Category_1          : Factor w/ 18 levels "1","2","3","4",..: 3 1 12 12 8 1 1 1 1 8 ...
# $ Product_Category_2          : Factor w/ 18 levels "0","2","3","4",..: 1 6 1 14 1 2 8 15 16 1 ...
# $ Product_Category_3          : Factor w/ 16 levels "0","3","4","5",..: 1 12 1 1 1 1 15 1 1 1 ...
# $ Purchase                    : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
# $ product_all                 : num  3 21 12 26 8 3 26 16 17 8 ...
# $ GenderF                     : num  1 1 1 1 0 0 0 0 0 0 ...
# $ GenderM                     : num  0 0 0 0 1 1 1 1 1 1 ...
# $ Age0.17                     : num  1 1 1 1 0 0 0 0 0 0 ...
# $ Age18.25                    : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age26.35                    : num  0 0 0 0 0 1 0 0 0 1 ...
# $ Age36.45                    : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age46.50                    : num  0 0 0 0 0 0 1 1 1 0 ...
# $ Age51.55                    : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age55.                      : num  0 0 0 0 1 0 0 0 0 0 ...
# $ City_CategoryA              : num  1 1 1 1 0 1 0 0 0 1 ...
# $ City_CategoryB              : num  0 0 0 0 0 0 1 1 1 0 ...
# $ City_CategoryC              : num  0 0 0 0 1 0 0 0 0 0 ...
# $ Stay_In_Current_City_Years0 : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Stay_In_Current_City_Years1 : num  0 0 0 0 0 0 0 0 0 1 ...
# $ Stay_In_Current_City_Years2 : num  1 1 1 1 0 0 1 1 1 0 ...
# $ Stay_In_Current_City_Years3 : num  0 0 0 0 0 1 0 0 0 0 ...
# $ Stay_In_Current_City_Years4.: num  0 0 0 0 1 0 0 0 0 0 ...

# Exclude the variables that are not used for clustering: the identifiers,
# the original categorical columns (already dummy encoded) and product_all.
BlackFriday_cluster <- select(
  BlackFriday_2,
  -c(
    User_ID,
    Product_ID,
    Gender,
    Age,
    City_Category,
    Stay_In_Current_City_Years,
    product_all
  )
)

str(BlackFriday_cluster)

# Convert the categorical (factor) columns to numeric columns.
# as.numeric() applied directly to a factor returns the internal level index,
# not the label (Occupation "10" would become 11, Marital_Status 0/1 would
# become 1/2). Converting through as.character() recovers the original codes.
# NOTE: any output recorded after this point with the level indices (the str()
# listing and the k-means cluster means) will differ from a fresh run.
BlackFriday_cluster$Occupation <-
  as.numeric(as.character(BlackFriday_cluster$Occupation))
BlackFriday_cluster$Marital_Status <-
  as.numeric(as.character(BlackFriday_cluster$Marital_Status))
BlackFriday_cluster$Product_Category_1 <-
  as.numeric(as.character(BlackFriday_cluster$Product_Category_1))
BlackFriday_cluster$Product_Category_2 <-
  as.numeric(as.character(BlackFriday_cluster$Product_Category_2))
BlackFriday_cluster$Product_Category_3 <-
  as.numeric(as.character(BlackFriday_cluster$Product_Category_3))
str(BlackFriday_cluster)

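# NOTE: the listing below does not match the code above. It shows 27 variables,
# including Gender_binary, Age_binary, City_Category_numeric and
# Stay_In_Current_City_Years_numeric, which exist only in BlackFriday_1, whereas
# BlackFriday_cluster is built from BlackFriday_2 (30 columns minus the 7
# dropped ones = 23 columns). It was presumably recorded from an earlier
# variant of this script.
# 'data.frame':	537577 obs. of  27 variables: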
#   $ Occupation                        : num  11 11 11 11 17 16 8 8 8 21 ...
# $ Marital_Status                    : num  1 1 1 1 1 1 2 2 2 2 ...
# $ Product_Category_1                : num  3 1 12 12 8 1 1 1 1 8 ...
# $ Product_Category_2                : num  1 6 1 14 1 2 8 15 16 1 ...
# $ Product_Category_3                : num  1 12 1 1 1 1 15 1 1 1 ...
# $ Purchase                          : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
# $ Gender_binary                     : num  NA NA NA NA NA NA NA NA NA NA ...
# $ Age_binary                        : num  NA NA NA NA NA NA NA NA NA NA ...
# $ City_Category_numeric             : num  NA NA NA NA NA NA NA NA NA NA ...
# $ Stay_In_Current_City_Years_numeric: num  2 2 2 2 NA 3 2 2 2 1 ...
# $ GenderF                           : num  1 1 1 1 0 0 0 0 0 0 ...
# $ GenderM                           : num  0 0 0 0 1 1 1 1 1 1 ...
# $ Age0.17                           : num  1 1 1 1 0 0 0 0 0 0 ...
# $ Age18.25                          : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age26.35                          : num  0 0 0 0 0 1 0 0 0 1 ...
# $ Age36.45                          : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age46.50                          : num  0 0 0 0 0 0 1 1 1 0 ...
# $ Age51.55                          : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Age55.                            : num  0 0 0 0 1 0 0 0 0 0 ...
# $ City_CategoryA                    : num  1 1 1 1 0 1 0 0 0 1 ...
# $ City_CategoryB                    : num  0 0 0 0 0 0 1 1 1 0 ...
# $ City_CategoryC                    : num  0 0 0 0 1 0 0 0 0 0 ...
# $ Stay_In_Current_City_Years0       : num  0 0 0 0 0 0 0 0 0 0 ...
# $ Stay_In_Current_City_Years1       : num  0 0 0 0 0 0 0 0 0 1 ...
# $ Stay_In_Current_City_Years2       : num  1 1 1 1 0 0 1 1 1 0 ...
# $ Stay_In_Current_City_Years3       : num  0 0 0 0 0 1 0 0 0 0 ...
# $ Stay_In_Current_City_Years4.      : num  0 0 0 0 1 0 0 0 0 0 ...

# Run the cluster analysis with kmeans(), asking for three clusters.
# The seed is fixed first so the random initial centres are repeatable.
set.seed(seed = 1234)
kmeans_BF <- kmeans(x = BlackFriday_cluster, centers = 3)
kmeans_BF

# K-means clustering with 3 clusters of sizes 119245, 252697, 165635
# 
# Cluster means:
#   Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3  Purchase   GenderF   GenderM
# 1   9.312164       1.406566           3.473278           7.398918           6.324022 17055.538 0.2006709 0.7993291
# 2   9.060527       1.411584           5.136032           6.955096           3.749827  9044.255 0.2566077 0.7433923
# 3   8.951363       1.406152           6.850804           7.091376           2.705219  4216.649 0.2621668 0.7378332
# Age0.17  Age18.25  Age26.35  Age36.45   Age46.50   Age51.55     Age55. City_CategoryA City_CategoryB City_CategoryC
# 1 0.02627364 0.1803262 0.3977022 0.2041092 0.08006206 0.07376410 0.03776259      0.2353893      0.4050652      0.3595455
# 2 0.02657728 0.1775526 0.3961108 0.2002952 0.08450832 0.07279469 0.04216117      0.2689545      0.4185883      0.3124572
# 3 0.02932955 0.1887524 0.4055302 0.1964923 0.08225315 0.06295167 0.03469074      0.2934464      0.4371962      0.2693573
# Stay_In_Current_City_Years0 Stay_In_Current_City_Years1 Stay_In_Current_City_Years2 Stay_In_Current_City_Years3
# 1                   0.1307225                   0.3494822                   0.1906914                   0.1750094
# 2                   0.1364401                   0.3514090                   0.1841890                   0.1735834
# 3                   0.1368008                   0.3545024                   0.1821837                   0.1725420
# Stay_In_Current_City_Years4.
# 1                    0.1540945
# 2                    0.1543786
# 3                    0.1539711
# 
# Clustering vector:
#   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24 
# 2    1    3    3    2    1    1    1    1    2    3    3    3    1    3    3    1    2    2    1    2    2    2    3 
# 25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
# 2    1    3    2    3    1    2    3    2    2    3    2    2    2    1    1    2    1    2    1    2    2    2    3 
# 49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72 
# 1    2    1    2    3    1    2    3    1    3    2    3    3    3    3    2    2    2    2    1    2    1    3    2 
# 73   74   75   76   77   78   79   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96 
# 3    1    1    1    1    1    1    1    3    2    3    2    3    2    1    2    1    2    2    3    3    2    2    3 
# 97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120 
# 3    3    2    2    2    2    1    2    2    3    2    3    3    2    3    2    1    2    3    1    2    1    1    1 
# 121  122  123  124  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144 
# 2    3    3    2    3    2    1    1    1    1    3    3    3    3    3    2    2    3    3    3    1    3    2    2 
# 145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168 
# 2    1    1    3    2    2    2    3    1    2    1    2    2    3    2    2    3    2    3    2    2    1    3    3 
# 169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192 
# 1    3    3    2    3    3    2    3    3    2    3    2    2    2    3    3    1    2    2    2    2    3    3    3 
# 193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215  216 
# 3    3    3    2    3    2    1    2    2    2    2    2    1    2    2    2    1    2    3    2    1    1    2    1 
# 217  218  219  220  221  222  223  224  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240 
# 1    1    1    2    2    1    3    3    1    3    2    2    1    2    3    3    3    2    1    1    2    2    3    2 
# 241  242  243  244  245  246  247  248  249  250  251  252  253  254  255  256  257  258  259  260  261  262  263  264 
# 2    1    3    1    3    2    2    3    2    3    3    3    3    3    3    3    3    2    2    3    3    2    2    2 
# 265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288 
# 3    3    1    2    3    2    1    2    3    1    2    2    2    3    3    3    1    2    1    2    2    2    3    3 
# 289  290  291  292  293  294  295  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312 
# 3    3    3    2    2    1    1    2    2    3    3    1    1    1    1    2    3    2    1    2    1    1    1    3 
# 313  314  315  316  317  318  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336 
# 2    1    2    2    1    1    1    3    2    2    3    2    1    2    2    2    2    2    1    2    1    3    3    3 
# 337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354  355  356  357  358  359  360 
# 2    2    3    3    2    3    1    1    2    1    1    2    2    2    2    1    3    2    2    3    3    2    2    2 
# 361  362  363  364  365  366  367  368  369  370  371  372  373  374  375  376  377  378  379  380  381  382  383  384 
# 2    3    2    3    3    3    3    2    2    2    3    3    1    3    2    1    1    2    1    1    2    2    2    2 
# 385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400  401  402  403  404  405  406  407  408 
# 1    3    2    2    3    2    2    3    1    1    1    1    2    1    2    2    2    2    2    2    2    2    2    2 
# 409  410  411  412  413  414  415  416  417  418  419  420  421  422  423  424  425  426  427  428  429  430  431  432 
# 2    3    2    2    3    2    2    1    1    2    2    1    1    1    2    1    1    2    3    1    1    3    2    2 
# 433  434  435  436  437  438  439  440  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455  456 
# 2    2    2    1    2    3    2    3    3    3    2    2    2    2    3    2    2    2    1    2    2    2    2    2 
# 457  458  459  460  461  462  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480 
# 2    1    2    2    2    2    3    3    2    1    3    1    1    3    3    3    2    2    2    2    3    3    2    2 
# 481  482  483  484  485  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500  501  502  503  504 
# 2    2    1    3    3    2    3    3    3    2    2    3    2    3    2    2    1    3    3    3    3    3    3    3 
# 505  506  507  508  509  510  511  512  513  514  515  516  517  518  519  520  521  522  523  524  525  526  527  528 
# 3    3    1    3    2    3    2    2    1    1    1    2    2    2    1    2    1    2    2    3    2    3    1    3 
# 529  530  531  532  533  534  535  536  537  538  539  540  541  542  543  544  545  546  547  548  549  550  551  552 
# 2    2    3    2    2    2    3    2    3    2    2    3    1    2    2    3    2    2    2    1    1    1    1    2 
# 553  554  555  556  557  558  559  560  561  562  563  564  565  566  567  568  569  570  571  572  573  574  575  576 
# 3    2    3    3    2    3    1    1    1    1    3    2    3    2    2    2    1    2    1    3    3    2    3    3 
# 577  578  579  580  581  582  583  584  585  586  587  588  589  590  591  592  593  594  595  596  597  598  599  600 
# 1    2    2    2    1    3    1    1    2    2    3    1    1    2    2    2    2    3    2    3    2    1    2    2 
# 601  602  603  604  605  606  607  608  609  610  611  612  613  614  615  616  617  618  619  620  621  622  623  624 
# 3    3    3    3    3    2    2    1    1    1    2    2    1    2    1    3    2    3    2    3    3    1    1    3 
# 625  626  627  628  629  630  631  632  633  634  635  636  637  638  639  640  641  642  643  644  645  646  647  648 
# 1    3    3    3    2    3    1    1    1    2    1    1    1    3    1    1    2    2    3    2    2    2    3    1 
# 649  650  651  652  653  654  655  656  657  658  659  660  661  662  663  664  665  666  667  668  669  670  671  672 
# 1    1    1    2    1    3    1    3    1    3    2    2    1    1    3    1    1    3    3    1    1    1    1    1 
# 673  674  675  676  677  678  679  680  681  682  683  684  685  686  687  688  689  690  691  692  693  694  695  696 
# 1    1    1    1    2    2    2    2    3    3    3    2    2    2    2    2    3    3    2    2    2    2    2    2 
# 697  698  699  700  701  702  703  704  705  706  707  708  709  710  711  712  713  714  715  716  717  718  719  720 
# 3    1    3    3    1    1    3    2    3    3    2    3    3    1    2    1    3    2    2    3    3    2    2    1 
# 721  722  723  724  725  726  727  728  729  730  731  732  733  734  735  736  737  738  739  740  741  742  743  744 
# 1    2    2    2    1    2    1    2    1    2    2    1    2    1    1    2    1    1    1    3    1    1    3    3 
# 745  746  747  748  749  750  751  752  753  754  755  756  757  758  759  760  761  762  763  764  765  766  767  768 
# 3    3    3    1    2    1    2    1    2    1    3    2    3    1    3    1    2    3    1    2    2    3    2    1 
# 769  770  771  772  773  774  775  776  777  778  779  780  781  782  783  784  785  786  787  788  789  790  791  792 
# 2    3    2    2    2    1    3    3    2    2    1    2    2    1    2    2    1    2    1    1    1    1    1    2 
# 793  794  795  796  797  798  799  800  801  802  803  804  805  806  807  808  809  810  811  812  813  814  815  816 
# 1    1    3    3    3    2    2    2    2    2    3    2    2    3    3    3    1    3    3    3    3    1    1    1 
# 817  818  819  820  821  822  823  824  825  826  827  828  829  830  831  832  833  834  835  836  837  838  839  840 
# 2    2    1    1    2    1    2    3    2    2    1    2    2    1    3    2    2    1    1    2    1    1    2    1 
# 841  842  843  844  845  846  847  848  849  850  851  852  853  854  855  856  857  858  859  860  861  862  863  864 
# 2    2    2    3    3    2    2    2    3    3    1    2    2    2    2    2    2    3    2    3    2    2    3    2 
# 865  866  867  868  869  870  871  872  873  874  875  876  877  878  879  880  881  882  883  884  885  886  887  888 
# 2    2    1    2    2    2    1    3    2    2    2    3    3    1    2    2    3    2    1    2    2    3    2    3 
# 889  890  891  892  893  894  895  896  897  898  899  900  901  902  903  904  905  906  907  908  909  910  911  912 
# 2    3    3    3    2    2    1    2    3    3    2    1    1    2    2    1    2    2    1    1    2    2    3    2 
# 913  914  915  916  917  918  919  920  921  922  923  924  925  926  927  928  929  930  931  932  933  934  935  936 
# 2    1    2    3    3    3    3    3    3    3    2    2    2    2    1    2    2    2    3    3    3    3    3    3 
# 937  938  939  940  941  942  943  944  945  946  947  948  949  950  951  952  953  954  955  956  957  958  959  960 
# 3    3    3    2    1    2    1    2    2    1    1    1    1    2    1    2    2    2    2    2    2    2    1    3 
# 961  962  963  964  965  966  967  968  969  970  971  972  973  974  975  976  977  978  979  980  981  982  983  984 
# 2    3    3    3    3    2    2    3    3    3    2    2    1    3    2    2    2    2    3    3    3    2    3    2 
# 985  986  987  988  989  990  991  992  993  994  995  996  997  998  999 1000 
# 2    2    2    3    2    1    3    1    1    1    3    1    1    1    1    3 
# [ reached getOption("max.print") -- omitted 536577 entries ]
# 
# Within cluster sum of squares by cluster:
#   [1] 636141159429 810472759392 422633696404
# (between_SS / total_SS =  86.0 %)
# 
# Available components:
#   
#   [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"         "iter"        
# [9] "ifault"      

# 해석

# kmeans 함수를 활용해 kmeans clustering을 수행했다. 군집의 수는 3개로 했으며, 각 군집에 119245,252697,165635개로 군집이 묶였다.
# between_SS / total_SS의 값이 86%로 나타나 군집이 잘되었다고 판단할 수 있다.

# Sum of square means 그래프를 통해 최적의 군집을 찾는다.

# Sum of square means 그래프로 최적의 군집 찾기
# 최소 군집 2개, 최대 군집 15개

#' Plot the within-cluster sum of squares against the number of clusters.
#'
#' For one cluster the value is the total sum of squares of `data`. For each
#' k from 2 to `nc`, kmeans() is run with `centers = k` after resetting the
#' random number generator with `seed`, and the total within-cluster sum of
#' squares of that fit is recorded. The values are drawn as a line-and-point
#' ("elbow") plot.
#'
#' @param data Data frame or matrix with one observation per row; it is
#'   converted with as.matrix() before use.
#' @param nc Largest number of clusters to try (a single number, at least 1).
#' @param seed Seed passed to set.seed() before every kmeans() run.
#' @return Invisibly, a numeric vector of length `nc` whose k-th element is
#'   the within-cluster sum of squares for k clusters.
wssplot <- function(data, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, !is.na(nc), nc >= 1)
  nc <- as.integer(nc)

  # Convert once; kmeans() would otherwise repeat the conversion on each call,
  # and apply() is meant for matrices rather than data frames.
  x <- as.matrix(data)

  wss <- numeric(nc)
  # k = 1: all points share one cluster, so WSS equals the total sum of
  # squares, i.e. (n - 1) times the sum of the column variances.
  wss[1] <- (nrow(x) - 1) * sum(apply(x, 2, var))

  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count down
  # (2, 1) and leave wss longer than the x axis passed to plot().
  for (k in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[k] <- kmeans(x, centers = k)$tot.withinss
  }

  plot(
    seq_len(nc), wss,
    type = "b",
    xlab = "Number of Clusters",
    ylab = "Within groups sum of squares"
  )
  invisible(wss)
}

# Draw the elbow plot for 1 to 15 clusters (wssplot() defaults).
wssplot(BlackFriday_cluster)

# The elbow plot was read as indicating four clusters.


# Re-run the cluster analysis with the chosen number of clusters (4).
# The seed is fixed so the fit is reproducible when this step is run on its
# own; 1234 is also wssplot()'s default seed, so this is the same fit as the
# k = 4 point of the elbow plot. Cluster numbering, and therefore the order of
# the cluster sizes, may differ from output recorded without the seed.
set.seed(1234)
kmeans_BF_4 <- kmeans(BlackFriday_cluster, centers = 4)
kmeans_BF_4

# K-means clustering with 4 clusters of sizes 109575, 89439, 106191, 232372
# 
# Cluster means:
#   Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3  Purchase   GenderF   GenderM    Age0.17
# 1   9.326899       1.406042           3.460297           7.532576           6.425453 17380.202 0.1938763 0.8061237 0.02496920
# 2   8.939165       1.403594           7.168260           7.151131           3.032883  3000.617 0.2532452 0.7467548 0.03328526
# 3   9.143760       1.412031           4.351047           6.888559           5.017516 11276.081 0.2371199 0.7628801 0.02745995
# 4   8.994913       1.410622           5.871783           6.962633           2.761155  7089.676 0.2716463 0.7283537 0.02615634
# Age18.25  Age26.35  Age36.45   Age46.50   Age51.55     Age55. City_CategoryA City_CategoryB City_CategoryC
# 1 0.1785352 0.4000639 0.2037965 0.07990874 0.07438741 0.03833904      0.2349167      0.4041433      0.3609400
# 2 0.1978332 0.4060756 0.1910911 0.07978622 0.05884458 0.03308400      0.2981697      0.4345420      0.2672883
# 3 0.1781507 0.3917564 0.2028703 0.08342515 0.07384807 0.04248948      0.2610485      0.4180298      0.3209217
# 4 0.1784165 0.3999320 0.2002565 0.08510061 0.07041296 0.03972510      0.2776066      0.4258387      0.2965547
# Stay_In_Current_City_Years0 Stay_In_Current_City_Years1 Stay_In_Current_City_Years2 Stay_In_Current_City_Years3
# 1                   0.1304860                   0.3501072                   0.1901164                   0.1749395
# 2                   0.1377587                   0.3539731                   0.1828844                   0.1727099
# 3                   0.1356518                   0.3535987                   0.1836220                   0.1711350
# 4                   0.1364235                   0.3512514                   0.1840626                   0.1743885
# Stay_In_Current_City_Years4.
# 1                    0.1543509
# 2                    0.1526739
# 3                    0.1559925
# 4                    0.1538740
# 
# Clustering vector:
#   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25 
# 4    1    2    2    4    1    1    1    1    4    4    2    4    1    4    2    3    4    3    1    4    3    3    4    3 
# 26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49   50 
# 1    4    4    4    1    4    4    4    3    4    4    3    4    1    1    4    1    4    1    3    3    3    4    1    4 
# 51   52   53   54   55   56   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
# 1    4    4    1    3    4    1    4    3    4    2    2    4    4    3    3    4    1    4    1    2    3    2    1    1 
# 76   77   78   79   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96   97   98   99  100 
# 1    1    1    1    3    2    4    4    4    4    4    1    3    1    4    3    2    2    4    4    2    2    2    3    4 
# 101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125 
# 3    4    1    3    4    4    4    4    2    4    2    3    1    4    4    1    3    1    1    1    3    4    2    4    2 
# 126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150 
# 4    1    1    1    1    2    2    4    4    4    3    4    4    4    4    1    2    4    4    3    1    1    2    4    4 
# 151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175 
# 4    2    1    4    1    3    3    2    4    4    4    3    2    4    4    1    4    4    1    4    4    4    2    4    4 
# 176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195  196  197  198  199  200 
# 2    2    4    2    4    4    4    4    4    1    4    3    4    4    4    4    4    2    4    4    3    2    4    1    4 
# 201  202  203  204  205  206  207  208  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225 
# 4    4    4    4    1    4    4    4    1    3    2    4    1    1    3    1    1    1    1    4    3    1    4    4    1 
# 226  227  228  229  230  231  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247  248  249  250 
# 4    4    3    1    3    2    2    4    4    1    1    4    4    2    4    3    3    2    1    2    3    4    2    4    2 
# 251  252  253  254  255  256  257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272  273  274  275 
# 2    2    2    2    2    4    2    4    4    2    2    4    4    4    2    2    1    3    4    4    1    3    4    1    3 
# 276  277  278  279  280  281  282  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300 
# 4    4    2    4    4    1    4    1    3    3    3    2    2    2    2    2    3    3    3    1    3    4    4    4    1 
# 301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318  319  320  321  322  323  324  325 
# 1    1    1    4    2    4    1    3    1    1    1    4    3    1    3    3    3    1    1    2    4    3    2    3    1 
# 326  327  328  329  330  331  332  333  334  335  336  337  338  339  340  341  342  343  344  345  346  347  348  349  350 
# 3    4    3    4    4    1    4    1    4    4    4    3    3    2    2    3    2    1    1    3    3    1    4    4    4 
# 351  352  353  354  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372  373  374  375 
# 4    1    4    4    3    4    2    4    4    3    4    4    3    4    2    2    2    4    4    4    2    2    1    4    3 
# 376  377  378  379  380  381  382  383  384  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400 
# 1    1    3    1    1    4    4    3    4    1    4    4    3    4    4    4    2    1    3    1    1    3    1    4    4 
# 401  402  403  404  405  406  407  408  409  410  411  412  413  414  415  416  417  418  419  420  421  422  423  424  425 
# 4    3    4    3    4    3    3    4    4    2    3    4    4    4    4    1    1    3    4    1    1    1    3    3    3 
# 426  427  428  429  430  431  432  433  434  435  436  437  438  439  440  441  442  443  444  445  446  447  448  449  450 
# 3    4    1    1    2    3    3    3    4    4    1    3    4    4    4    4    2    4    4    4    4    2    4    4    4 
# 451  452  453  454  455  456  457  458  459  460  461  462  463  464  465  466  467  468  469  470  471  472  473  474  475 
# 1    3    3    4    3    4    4    1    4    4    4    3    4    2    3    1    4    1    1    2    2    2    3    4    4 
# 476  477  478  479  480  481  482  483  484  485  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500 
# 4    2    4    4    4    4    4    1    2    4    3    4    2    2    3    4    4    4    4    4    4    3    2    2    2 
# 501  502  503  504  505  506  507  508  509  510  511  512  513  514  515  516  517  518  519  520  521  522  523  524  525 
# 4    4    4    2    2    2    1    2    3    4    4    3    3    1    1    4    3    3    1    4    1    3    3    2    4 
# 526  527  528  529  530  531  532  533  534  535  536  537  538  539  540  541  542  543  544  545  546  547  548  549  550 
# 2    1    2    4    4    4    4    4    3    4    4    2    3    4    4    1    3    4    2    3    4    3    1    1    1 
# 551  552  553  554  555  556  557  558  559  560  561  562  563  564  565  566  567  568  569  570  571  572  573  574  575 
# 1    3    2    4    2    2    4    2    1    1    1    1    4    4    2    3    4    4    1    3    1    2    4    3    2 
# 576  577  578  579  580  581  582  583  584  585  586  587  588  589  590  591  592  593  594  595  596  597  598  599  600 
# 2    1    4    4    4    1    4    3    1    3    3    2    1    1    4    3    4    3    4    4    2    4    1    4    3 
# 601  602  603  604  605  606  607  608  609  610  611  612  613  614  615  616  617  618  619  620  621  622  623  624  625 
# 4    2    2    2    4    4    3    1    1    1    4    3    1    3    1    4    4    4    4    2    4    1    1    2    1 
# 626  627  628  629  630  631  632  633  634  635  636  637  638  639  640  641  642  643  644  645  646  647  648  649  650 
# 2    2    4    3    2    1    1    1    3    1    1    1    4    1    1    4    4    2    3    4    4    2    1    1    1 
# 651  652  653  654  655  656  657  658  659  660  661  662  663  664  665  666  667  668  669  670  671  672  673  674  675 
# 1    3    1    2    1    4    1    4    4    4    1    1    4    1    1    2    2    1    1    1    1    1    1    1    1 
# 676  677  678  679  680  681  682  683  684  685  686  687  688  689  690  691  692  693  694  695  696  697  698  699  700 
# 1    4    4    4    4    4    4    4    4    4    4    4    4    4    2    4    4    3    4    4    4    2    1    2    2 
# 701  702  703  704  705  706  707  708  709  710  711  712  713  714  715  716  717  718  719  720  721  722  723  724  725 
# 1    1    2    4    2    2    4    2    2    1    4    1    4    4    4    4    4    4    3    1    1    3    3    4    1 
# 726  727  728  729  730  731  732  733  734  735  736  737  738  739  740  741  742  743  744  745  746  747  748  749  750 
# 4    1    3    1    4    4    1    3    1    1    4    1    1    1    2    1    1    2    2    2    4    2    1    3    1 
# 751  752  753  754  755  756  757  758  759  760  761  762  763  764  765  766  767  768  769  770  771  772  773  774  775 
# 4    1    4    1    4    3    2    1    4    1    3    2    1    4    3    2    4    1    4    2    3    3    4    1    2 
# 776  777  778  779  780  781  782  783  784  785  786  787  788  789  790  791  792  793  794  795  796  797  798  799  800 
# 4    4    4    1    4    4    1    3    4    1    3    1    1    1    1    1    3    1    1    2    2    2    4    4    4 
# 801  802  803  804  805  806  807  808  809  810  811  812  813  814  815  816  817  818  819  820  821  822  823  824  825 
# 4    4    4    3    4    2    4    4    1    4    4    4    4    1    1    1    3    4    1    1    3    1    4    4    4 
# 826  827  828  829  830  831  832  833  834  835  836  837  838  839  840  841  842  843  844  845  846  847  848  849  850 
# 3    1    4    4    1    4    4    4    1    1    4    3    1    4    1    3    3    4    4    4    4    3    4    2    4 
# 851  852  853  854  855  856  857  858  859  860  861  862  863  864  865  866  867  868  869  870  871  872  873  874  875 
# 1    4    4    4    4    4    3    4    4    4    4    4    4    4    4    4    1    4    4    4    1    4    4    4    4 
# 876  877  878  879  880  881  882  883  884  885  886  887  888  889  890  891  892  893  894  895  896  897  898  899  900 
# 4    2    1    3    3    4    4    1    4    4    4    3    4    4    2    4    4    3    3    3    3    2    4    4    1 
# 901  902  903  904  905  906  907  908  909  910  911  912  913  914  915  916  917  918  919  920  921  922  923  924  925 
# 1    4    3    1    4    3    3    1    3    4    4    4    3    1    4    2    2    2    2    2    2    2    4    3    3 
# 926  927  928  929  930  931  932  933  934  935  936  937  938  939  940  941  942  943  944  945  946  947  948  949  950 
# 3    1    4    3    4    2    2    2    2    2    2    2    2    4    3    1    4    1    3    3    1    1    1    1    4 
# 951  952  953  954  955  956  957  958  959  960  961  962  963  964  965  966  967  968  969  970  971  972  973  974  975 
# 1    4    3    3    4    4    3    3    1    2    3    4    4    4    4    4    4    4    2    4    3    4    1    2    4 
# 976  977  978  979  980  981  982  983  984  985  986  987  988  989  990  991  992  993  994  995  996  997  998  999 1000 
# 4    4    4    4    4    4    4    4    4    4    4    4    2    4    1    4    1    1    1    2    1    1    1    3    2 
# [ reached getOption("max.print") -- omitted 536577 entries ]
# 
# Within cluster sum of squares by cluster:
#   [1] 493145752071 124258692582 152898770978 314769926422
# (between_SS / total_SS =  91.9 %)
# 
# Available components:
#   
#   [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"         "iter"        
# [9] "ifault"

# 해석

# 군집의 수를 4개로 하여 분석을 실시했으며, 각 군집에 106191, 232372, 109575, 89439개로 군집이 묶였다.
# between_SS / total_SS의 값이 91.9%로 나타나 군집화가 매우 잘되었다고 판단할 수 있다.


# Label every row of the original data with the cluster it was assigned to
# (new column `clust`) and export the labelled data as a CSV file.

kmeans_clust <- kmeans_BF_4$cluster
BlackFriday_full <- cbind(BlackFriday, clust = kmeans_clust)

# Export step that the notes in this section describe; the row names are only
# the running row index, so they are left out of the file.
write.csv(BlackFriday_full, "BlackFriday_full.csv", row.names = FALSE)

# Confirm that the `clust` column was appended.
str(BlackFriday_full)

# 'data.frame':	537577 obs. of  14 variables:
#   $ User_ID                   : chr  "1000001" "1000001" "1000001" "1000001" ...
# $ Product_ID                : chr  "P00069042" "P00248942" "P00087842" "P00085442" ...
# $ Gender                    : chr  "F" "F" "F" "F" ...
# $ Age                       : chr  "0-17" "0-17" "0-17" "0-17" ...
# $ Occupation                : Factor w/ 21 levels "0","1","2","3",..: 11 11 11 11 17 16 8 8 8 21 ...
# $ City_Category             : chr  "A" "A" "A" "A" ...
# $ Stay_In_Current_City_Years: chr  "2" "2" "2" "2" ...
# $ Marital_Status            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 2 2 ...
# $ Product_Category_1        : Factor w/ 18 levels "1","2","3","4",..: 3 1 12 12 8 1 1 1 1 8 ...
# $ Product_Category_2        : Factor w/ 18 levels "0","2","3","4",..: 1 6 1 14 1 2 8 15 16 1 ...
# $ Product_Category_3        : Factor w/ 16 levels "0","3","4","5",..: 1 12 1 1 1 1 15 1 1 1 ...
# $ Purchase                  : int  8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
# $ product_all               : num  3 21 12 26 8 3 26 16 17 8 ...
# $ clust                     : int  4 1 2 2 4 1 1 1 1 4 ...

# 군집의 개수를 4개로 하여 해당 cluster 분류를 열 결합하여 BlackFriday_full 데이터에 저장했다.
# 그리고 str 함수 구조를 파악해 clust 변수가 추가되었음을 확인하고 write.csv 함수로 해당 데이터를 csv 데이터로 출력했다.

# Use table() to count the observations in each cluster; xtabs() is then used
# to profile the clusters.

# Per the recorded output, cluster sizes from largest to smallest: 4 > 1 > 3 > 2
table(BlackFriday_full[["clust"]])
# 1      2      3      4 
# 109575  89439 106191 232372 

# Gender summary by cluster

# Number of rows per Gender over the whole data set. A one-sided formula makes
# xtabs() count rows; putting `clust` on the left-hand side would instead add
# up the integer cluster labels within each gender, which is not a count.
xtabs(~ Gender, data = BlackFriday_full)

# Expected output (equals the column totals of the cluster-by-Gender
# cross-tabulation recorded in this script):
# Gender
#      F      M 
# 132197 405380

# Cross-tabulate cluster membership (rows) against Gender (columns).
xtabs(formula = ~ BlackFriday_full$clust + BlackFriday_full$Gender)

#                       BlackFriday_full$Gender
# BlackFriday_full$clust      F      M
#                       1  21244  88331
#                       2  22650  66789
#                       3  25180  81011
#                       4  63123 169249

# 해석

# 먼저 clust 개수를 파악하기 위해 table 함수를 사용했다. 4>1>3>2번 군집 순으로 군집의 크기가 큼을 알 수 있다.
# clust별 Gender를 요약할 때, xtabs 함수를 활용했다. clust x Gender 교차표의 열 합계로 전체 성비를 확인한 결과, Female이 132197건이고
# Male이 405380건으로 Male이 약 3배 많음을 파악할 수 있다. clust별 Gender를 확인한 결과 모든 clust에서
# Male의 숫자가 높았으며, 남녀 비율의 차이가 가장 크게 나는 군집은 1번 군집(Male/Female 약 4.2배)으로 나타났다.