인류의 복지와 편익을 위한 인프라 건설을 주도하는 토목공학과
제목
부천대학교 직무연수 빅데이터 분석 특강 주요 코딩
작성일
2021.07.08
작성자
부천대학교 토목과
# 1. One-sample t-test
x <- c(3.0, 1.5, 2.0, 1.0, 0.5, 0.3, 2.0, 1.0, 1.7, 0.3, 0.4)
shapiro.test(x)  # normality check before the t-test
# FIX: spell out 'alternative' instead of relying on partial matching ('alter=')
t.test(x, mu = 0.5, alternative = "greater", conf.level = 0.97)

# 2. Two-sample t-test
edu_1 <- read.csv("edu_1.csv")
head(edu_1)
var.test(score ~ type, data = edu_1)  # equal-variance check first
t.test(score ~ type, var.equal = TRUE, data = edu_1)

# 3. Paired t-test
edu_2 <- read.csv("edu_2.csv")
head(edu_2)
with(edu_2, shapiro.test(after - before))  # normality of the differences
with(edu_2, t.test(after - before))

# 4. Test of one population proportion
prop.test(470, 1000, p = 0.50)

# 5. Test of two population proportions
prop.test(c(120, 150), c(200, 350), alternative = "greater")

# 6. Goodness-of-fit test
x <- c(220, 145, 200, 400, 100)
chisq.test(x, p = c(0.2, 0.2, 0.2, 0.2, 0.2))

# 7. Test of independence
chisq.test(matrix(c(100, 120, 60, 85, 120, 80, 150, 250, 60, 40), ncol = 5))

# 8. Correlation analysis
cor_1 <- read.csv("cor_bu.csv")
head(cor_1)
cor(cor_1)
# FIX: replaced attach() with with() so column lookup stays explicit and the
# search path is not modified
with(cor_1, cor(time, score, method = "spearman"))
# FIX: 'kendal' was a typo that only worked through partial matching
with(cor_1, cor(time, score, method = "kendall"))
with(cor_1, cor.test(time, score))

# 9. Simple linear regression
edu_3 <- read.csv("cor_bu.csv")
head(edu_3)
out <- lm(score ~ time, data = edu_3)
plot(score ~ time, data = edu_3, col = "red")
abline(out, col = "blue")
summary(out)
plot(out)  # regression diagnostic plots
new <- data.frame(time = 28)
predict(out, new, interval = "confidence")
predict(out, new, interval = "prediction")

# 10. Multiple regression
data <- read.csv("bu_salary.csv")
head(data)
model <- lm(salary ~ experience + score, data)
summary(model)
plot(model)
predict(model, data.frame(experience = 8, score = 90), interval = "confidence")
predict(model, data.frame(experience = 8, score = 90), interval = "prediction")

# 11. Analysis of variance
data <- read.csv("movie.csv")
head(data)
# FIX: since R 4.0 read.csv() returns character columns, so levels() would be
# NULL and relevel() below would error; convert to a factor explicitly
data$등급 <- as.factor(data$등급)
levels(data$등급)
install.packages("psych")
library(psych)
data <- read.csv("movie.csv")
data$등급 <- as.factor(data$등급)
describeBy(data$총관객수, group = data$등급, mat = TRUE)
out <- lm(log(총관객수) ~ 등급, data)  # log to stabilize the variance
summary(out)
install.packages("multcomp")
library(multcomp)
data$등급 <- relevel(data$등급, ref = "청소년관람불가")
dunnett <- glht(out, linfct = mcp(등급 = "Dunnett"))
summary(dunnett)
install.packages("psych")
library(psych)
data <- read.csv("movie.csv")
data$등급 <- as.factor(data$등급)
out <- lm(log(총관객수) ~ 등급, data)
plot(out)   # diagnostics on the log model
out2 <- lm(총관객수 ~ 등급, data)
plot(out2)  # compare diagnostics without the log transform
out <- lm(log(총관객수) ~ 등급, data)
out2 <- lm(총관객수 ~ 등급, data)

# 12. Analysis of covariance
teaching <- read.csv("teaching_method.csv")
head(teaching)
teaching$method <- as.factor(teaching$method)
teaching$method <- relevel(teaching$method, ref = "control")
# response is the gain score, with the pre-score as covariate
out <- lm(after - before ~ before + method, teaching)
anova(out)
install.packages("multcomp")
library(multcomp)
dunnett <- glht(out, linfct = mcp(method = "Dunnett"))
summary(dunnett)

# 13. Dummy variables and regression
inter <- read.csv("inter.csv")
head(inter, n = 10)
str(inter)
inter$class <- as.factor(inter$class)
str(inter)
model1 <- lm(score ~ grade + class + grade * class, inter)
summary(model1)
model2 <- lm(score ~ grade + class, inter)
summary(model2)
anova(model2, model1)  # does the interaction term improve the fit?
model3 <- lm(score ~ grade, inter)
summary(model3)
anova(model3, model2)
model4 <- lm(score ~ class, inter)
summary(model4)
anova(model2, model4)

# 14. Principal component analysis
ranking <- read.csv("company.csv")
head(ranking)
# reverse-code the defect column so that larger values mean better
ranking$불량 <- with(ranking, max(불량) - 불량)
head(ranking)
ranking <- ranking[, -1]  # drop the identifier column
head(ranking)
install.packages("psych")
library(psych)
pairs.panels(ranking)
h_pca <- prcomp(ranking[, 1:4], scale = TRUE)
summary(h_pca)
plot(h_pca, type = "l")  # scree plot
plot(h_pca$x[, 1], ranking$점수)
cor(ranking$점수, h_pca$x[, 1])
biplot(h_pca, choices = c(1, 2))

# 15. Exploratory factor analysis
data(attitude)
score <- attitude
head(score)
score_1 <- score[, -1]  # drop the overall rating column
head(score_1)
fa1 <- factanal(score_1, 2)
print(fa1, digits = 2, sort = TRUE)
install.packages("psych")
library(psych)
install.packages("GPArotation")
library(GPArotation)
scree(score_1, factors = FALSE)
fa1 <- factanal(score_1, 2)
print(fa1, digits = 2, sort = TRUE)
# FIX: 'N' was never defined; pass the actual number of observations
fa1_d <- fa(score_1, nfactors = 2, n.obs = nrow(score_1), rotate = "varimax")
fa.diagram(fa1_d)
fa2 <- factanal(score_1, 2, rotation = "none")
print(fa2, digits = 2, sort = TRUE)
fa2_d <- fa(score_1, nfactors = 2, n.obs = nrow(score_1), rotate = "none")
fa.diagram(fa2_d)
fa3 <- factanal(score_1, 2, rotation = "quartimax")
print(fa3, digits = 2, sort = TRUE)
fa3_d <- fa(score_1, nfactors = 2, n.obs = nrow(score_1), rotate = "quartimax")
fa.diagram(fa3_d)

# 16.
# Cluster analysis (section 16)
ranking <- read.csv("company.csv")
head(ranking)
ranking_1 <- ranking[, -1]  # drop the identifier column
head(ranking_1)
# hierarchical clustering with four linkage methods
hc1 <- hclust(dist(ranking_1, method = "euclidean"), method = "single")
plot(hc1)
hc2 <- hclust(dist(ranking_1, method = "euclidean"), method = "complete")
plot(hc2)
hc3 <- hclust(dist(ranking_1, method = "euclidean"), method = "average")
plot(hc3)
hc4 <- hclust(dist(ranking_1, method = "euclidean"), method = "ward.D")
plot(hc4)
ranking_scale <- scale(ranking_1)  # standardize before k-means
head(ranking_scale)
km <- kmeans(ranking_scale, centers = 3)
km$cluster
# elbow plot: total within-cluster SS for k = 1..10
# FIX: preallocate ESS instead of growing it inside the loop
ESS <- numeric(10)
for (k in seq_len(10)) {
  km <- kmeans(ranking_scale, k)
  ESS[k] <- sum(km$withinss)
}
plot(ESS, type = "l")
install.packages("mclust")
library(mclust)
mc <- Mclust(ranking_scale)  # model-based clustering
summary(mc)
plot(mc)
km <- kmeans(ranking_scale, centers = 4)
km$cluster

# 17. Discriminant analysis (logistic regression + ROC)
install.packages("ISLR")
library(ISLR)
head(Default, n = 30)
boxplot(balance ~ default, data = Default)
boxplot(income ~ default, data = Default)
# NOTE(review): this models 'student' as the response although the boxplots
# above explore 'default' — confirm the intended response variable
glm.fit <- glm(student ~ default + balance + income, data = Default,
               family = binomial)
summary(glm.fit)
install.packages("ROCR")
library(ROCR)
pred <- data.frame(student = Default$student, fit = glm.fit$fitted)
head(pred)
predob <- prediction(pred$fit, pred$student)
a <- performance(predob, "tpr", "fpr")  # ROC curve
plot(a)
xtabs(~student + (fit > 0.5), data = pred)  # confusion table at cutoff 0.5

# 18.
# Network analysis (section 18)
install.packages("igraph")
library(igraph)
sn <- read.table(file.choose(), header = FALSE)
head(sn, n = 30)
sn.df <- graph.data.frame(sn, directed = FALSE)
plot(sn.df)
sn1 <- subset(sn, sn$V1 == 50)  # ego network of vertex 50
head(sn1, n = 20)
sn1.df <- graph.data.frame(sn1, directed = FALSE)
plot(sn1.df)
vcount(sn.df)  # number of vertices
ecount(sn.df)  # number of edges
V(sn.df)$name
# vertex (or vertices) with the maximum degree
vmax <- V(sn.df)$name[degree(sn.df) == max(degree(sn.df))]
vmax
degree(sn.df, vmax)
sn1 <- subset(sn, sn$V1 == 107)
head(sn1, n = 30)
sn1.df <- graph.data.frame(sn1, directed = FALSE)
plot(sn1.df)
summary(degree(sn.df))
plot(degree(sn.df), xlab = "사용자 번호", ylab = "연결정도", type = "h")
sn.df.dist <- degree.distribution(sn.df)
plot(sn.df.dist, xlab = "연결정도", ylab = "확률")
degree(sn.df, normalized = TRUE)
tmax <- centralization.degree.tmax(sn.df)
centralization.degree(sn.df, normalized = FALSE)$centralization / tmax
closeness(sn.df, normalized = TRUE)
# FIX: the original divided the closeness centralization by the *degree*
# theoretical maximum (tmax above), which is the wrong denominator; let
# igraph apply the closeness-specific normalization itself
centralization.closeness(sn.df, normalized = TRUE)$centralization
betweenness(sn.df, normalized = TRUE)
graph.density(sn.df)
average.path.length(sn.df)
sn15 <- subset(sn, sn$V1 <= 15 & sn$V2 <= 15)
sn15.graph <- graph.data.frame(sn15, directed = FALSE)
shortest.paths(sn15.graph)
get.shortest.paths(sn15.graph, "10")

# 19. Association rule mining
install.packages("arules")
library(arules)
tr <- read.delim("dataTransactions.tab", stringsAsFactors = FALSE)
head(tr, n = 20)
tr.filter <- subset(tr, subset = !(corner %in% c("일반식품", "화장품")))
head(tr.filter, n = 20)
tr.filter.uniq <- unique(tr.filter)  # drop duplicated rows
head(tr.filter.uniq, n = 20)
trans <- as(split(tr.filter.uniq$corner, tr.filter.uniq$custid),
            "transactions")
trans
image(trans[1:5])
transactionInfo(trans[size(trans) > 15])
inspect(trans[1:2])
itemFrequencyPlot(trans, support = 0.2, cex.names = 0.8)
rules <- apriori(trans, parameter = list(support = 0.2, confidence = 0.8))
summary(rules)
inspect(rules)
rules.target <- subset(rules, rhs %in% "스포츠" & lift > 1.4)
inspect(sort(rules.target, by = "confidence"))
# FIX: 'row.name' was a misspelling of 'row.names'; arguments forwarded
# through '...' are not partially matched, so the original raised an error
write(rules.target, file = "C:/Users/pyh/Desktop/데이터/arules.txt",
      sep = "\t", row.names = FALSE)

# 20. Machine learning
install.packages("nnet")
library(nnet)
cb <- read.delim("Hshopping.txt", stringsAsFactors = FALSE)
cb$반품여부 <- factor(cb$반품여부)
# FIX: the original line contained a curly quote (install.packages("caret“, ...))
# which is a syntax error
install.packages("caret", dependencies = TRUE)
library(caret)
set.seed(1)  # reproducible 60/40 train/test split
inTrain <- createDataPartition(y = cb$반품여부, p = 0.6, list = FALSE)
cb.train <- cb[inTrain, ]
cb.test <- cb[-inTrain, ]
str(cb.train)
set.seed(1234567)  # reproducible neural-net initialization
nn_model <- nnet(반품여부 ~ 성별 + 나이 + 구매금액 + 출연자, data = cb.train,
                 size = 3, maxit = 1000)
cb.test$nn_pred <- predict(nn_model, cb.test, type = "class")
cb.test$nn_pred_prob <- predict(nn_model, cb.test, type = "raw")
head(cb.test, n = 30)
install.packages("devtools")
library(devtools)
source_url('https://gist.githubusercontent.com/Peque/41a9e20d6687f2f3108d/raw/85e14f3a292e126f1454864427e3a189c2fe33f3/nnet_plot_update.r')
plot.nnet(nn_model)
install.packages("NeuralNetTools")
library(NeuralNetTools)
garson(nn_model)  # variable importance for the neural net
# FIX: removed a stray ':' that sat between these two statements in the
# original (a syntax/runtime error)
install.packages("caret", dependencies = TRUE)
library(caret)
cb.test$nn_pred <- predict(nn_model, cb.test, type = "class")
cb.test$nn_pred <- factor(cb.test$nn_pred)
predicted <- as.factor(predict(nn_model, newdata = cb.test, type = "class"))
confusionMatrix(predicted, cb.test$반품여부)
install.packages("ROCR")
library(ROCR)
nn_pred <- prediction(predict(nn_model, newdata = cb.test, type = "raw"),
                      cb.test$반품여부)
nn_model.perf1 <- performance(nn_pred, "tpr", "fpr")  # ROC chart
plot(nn_model.perf1, colorize = TRUE)

# Decision tree (C5.0)
install.packages("caret")
install.packages("C50")
install.packages("ROCR")
install.packages("e1071", dependencies = TRUE)
library(caret)
library(C50)
library(ROCR)
library(e1071)
cb <- read.delim("Hshopping.txt", stringsAsFactors = FALSE)
head(cb)
str(cb)
cb$반품여부 <- factor(cb$반품여부)
set.seed(1)
inTrain <- createDataPartition(y = cb$반품여부, p = 0.6, list = FALSE)
cb.train <- cb[inTrain, ]
cb.test <- cb[-inTrain, ]
dim(cb.train)
dim(cb.test)
c5_options <- C5.0Control(winnow = FALSE, noGlobalPruning = FALSE)
c5_model <- C5.0(반품여부 ~ 성별 + 나이 + 구매금액 + 출연자, data = cb.train,
                 control = c5_options, rules = FALSE)
summary(c5_model)
plot(c5_model)
cb.test$c5_pred <- predict(c5_model, cb.test, type = "class")
cb.test$c5_pred_prob <- predict(c5_model, cb.test, type = "prob")
head(cb.test)
confusionMatrix(cb.test$c5_pred, cb.test$반품여부)
c5_pred <- prediction(cb.test$c5_pred_prob[, 2], cb.test$반품여부)
c5_model.perf1 <- performance(c5_pred, "tpr", "fpr")   # ROC chart
c5_model.perf2 <- performance(c5_pred, "lift", "rpp")  # lift chart
plot(c5_model.perf1, colorize = TRUE)
plot(c5_model.perf2, colorize = TRUE)

# Random forest
install.packages("randomForest")
library(randomForest)
rf_model <- randomForest(반품여부 ~ 성별 + 나이 + 구매금액 + 출연자,
                         data = cb.train, ntree = 10)
rf_model
plot(rf_model)
varImpPlot(rf_model)  # variable importance
cb.test$rf_pred <- predict(rf_model, cb.test, type = "response")
cb.test$rf_pred_prob <- predict(rf_model, cb.test, type = "prob")
confusionMatrix(cb.test$rf_pred, cb.test$반품여부)
rf_pred <- prediction(cb.test$rf_pred_prob[, 2], cb.test$반품여부)
rf_model.perf1 <- performance(rf_pred, "tpr", "fpr")  # ROC chart
plot(rf_model.perf1, colorize = TRUE)
performance(rf_pred, "auc")@y.values[[1]]  # area under the ROC curve