# ---- Hierarchical clustering ----
# Standardise the four numeric iris columns so that each variable carries the
# same weight in the Euclidean distance, then cluster with Ward's criterion.
data <- iris[, -5]
means <- vapply(data, mean, numeric(1))
SD <- vapply(data, sd, numeric(1))
dataScale <- scale(data, center = means, scale = SD)
Dist <- dist(dataScale, method = "euclidean")
# heatmap() requires a numeric matrix, so the "dist" object is expanded first.
heatmap(as.matrix(Dist), labRow = FALSE, labCol = FALSE)
# "ward.D" is the current name of the criterion older R versions called "ward".
clusteModel <- hclust(Dist, method = "ward.D")
# Cut the dendrogram into three groups and compare them with the species.
result <- cutree(clusteModel, k = 3)
table(iris[, 5], result)
plot(clusteModel)
# Fast hierarchical clustering: once attached, fastcluster's hclust() is the
# one found by the call below.
library(fastcluster)
# "ward.D" is the current name of the criterion formerly spelled "ward".
clusteModel <- hclust(Dist, method = "ward.D")
# With proxy attached, dist() accepts additional measures such as "cosine"
# and "Jaccard".
library(proxy)
res <- dist(data, method = "cosine")
# Jaccard distance between two binary vectors.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
dist(rbind(x, y), method = "Jaccard")
# Mean absolute difference between two vectors, skipping positions where
# either value is missing.
x <- c(0, 0, 1.2, 1, 0.5, 1, NA)
y <- c(1, 0, 2.3, 1, 0.9, 1, 1)
d <- abs(x - y)
# Only the six complete pairs contribute; the NA pair is dropped from both the
# sum and the divisor. Note that this reuses the name `Dist` for a scalar.
Dist <- sum(d, na.rm = TRUE) / sum(!is.na(d))
# ---- k-means clustering ----
# nstart = 10 runs ten random initialisations and keeps the best solution.
clusteModel <- kmeans(dataScale, centers = 3, nstart = 10)
class(clusteModel)
# Partitioning around medoids (PAM) on the standardised data.
library(proxy)
library(cluster)
# NOTE(review): cluster::pam() documents only "euclidean" and "manhattan" for
# `metric`; "Mahalanobis" is kept as written in the source -- confirm that the
# installed version accepts it, or pass a precomputed dissimilarity instead.
clustModel <- pam(dataScale, k = 3, metric = "Mahalanobis")
clustModel$medoids
table(iris$Species, clustModel$clustering)
# Draw both diagnostic plots side by side, then restore the previous layout so
# later plots are not squeezed into the 1 x 2 grid.
oldPar <- par(mfcol = c(1, 2))
plot(clustModel, which.plots = 2, main = "")
plot(clustModel, which.plots = 1, main = "")
par(oldPar)
# Choosing the number of clusters.
library(devtools)
# Install the helper package from GitHub only when it is not already present,
# so re-running the script does not hit the network every time.
if (!requireNamespace("rinds", quietly = TRUE)) {
  install_github("lijian13/rinds")
}
rinds::bestCluster(dataScale, 2:6)
library(fpc)
# k-means for k = 2..6, selecting k by average silhouette width ("asw").
pka <- kmeansruns(
  iris[, 1:4],
  krange = 2:6,
  critout = TRUE,
  runs = 2,
  criterion = "asw"
)
# ---- Density-based clustering ----
# Simulate two noisy, interleaved curves (a sine arc and a shifted cosine arc)
# with 100 points each; the data are random because no seed is set here.
x1 <- seq(0, pi, length.out = 100)
y1 <- sin(x1) + 0.1 * rnorm(100)
x2 <- 1.5 + seq(0, pi, length.out = 100)
y2 <- cos(x2) + 0.1 * rnorm(100)
data <- data.frame(c(x1, x2), c(y1, y2))
names(data) <- c("x", "y")
# Baseline: k-means with two centres and ten random starts.
model1 <- kmeans(data, centers = 2, nstart = 10)
library("fpc")
# Density-based clustering of `data`. The namespace is spelled out so that
# another attached package exporting dbscan() cannot mask this call.
model2 <- fpc::dbscan(data, eps = 0.3, MinPts = 4)
# ---- Self-organising map ----
library(kohonen)
# som() works on a numeric matrix, so the species column is dropped and the
# remaining data frame is converted.
data <- as.matrix(iris[, -5])
somModel <- som(data, grid = somgrid(15, 10, "hexagonal"))
# NOTE(review): only the tail "ours" of the plot type survived in the source;
# "dist.neighbours" is the kohonen plot type ending that way -- confirm.
plot(somModel, ncolors = 10, type = "dist.neighbours")
# Integer species codes; as.integer() is required because c() on a factor
# returns a factor in R >= 4.1 and `irisclass + 3` would then be NA.
irisclass <- as.integer(iris[, 5])
plot(
  somModel,
  type = "mapping",
  labels = irisclass,
  col = irisclass + 3,
  main = "mapping plot"
)
# ---- Principal component analysis ----
library(FactoMineR)
data(decathlon)
head(decathlon, n = 2)
# PCA of the ten event columns with base R.
pca1 <- princomp(decathlon[, 1:10])
# NOTE(review): the value of `type` was lost in the source; "lines" (scree
# plot) is assumed -- confirm.
plot(pca1, type = "lines")
# NOTE(review): the assignment target and both argument names were lost in the
# source (only "=PCA(decathlon,=11:12,=13)" survived). Reconstructed from the
# FactoMineR PCA() signature: columns 11-12 as supplementary quantitative
# variables and column 13 as a supplementary qualitative variable -- confirm.
res.pca <- PCA(decathlon, quanti.sup = 11:12, quali.sup = 13)
# ---- Correspondence analysis ----
library(MASS)
data(caith)
# Two-dimensional correspondence analysis of the caith contingency table,
# drawn as a biplot with a fixed x-axis range.
biplot(corresp(caith, nf = 2), xlim = c(-0.6, 0.8))
# ---- Visualising multivariate data ----
library(car)
# NOTE(review): the package name was lost in the source; `mpg` ships with
# ggplot2, which is assumed here -- confirm.
data(mpg, package = "ggplot2")
# NOTE(review): everything after 'displ' up to a trailing "E)" was lost in the
# source. The column selection and `ellipse = TRUE` below are a reconstruction;
# any further options the original passed are unknown -- confirm.
scatterplotMatrix(mpg[, c("displ", "cty", "hwy")], ellipse = TRUE)
library(corrplot)
data(mtcars)
M <- cor(mtcars)
# NOTE(review): the value of `order` was lost in the source; "hclust" is
# assumed -- confirm.
corrplot(M, order = "hclust")
# ---- Logistic regression ----
# NOTE(review): only "(1);" survived in the source; set.seed(1) is assumed
# because random draws follow immediately -- confirm.
set.seed(1)
# Simulate a binary response from a known logistic model.
b0 <- 1
b1 <- 2
b2 <- 3
x1 <- rnorm(1000)
x2 <- rnorm(1000)
z <- b0 + b1 * x1 + b2 * x2
pr <- 1 / (1 + exp(-z))
y <- rbinom(1000, 1, pr)
plotdata2 <- data.frame(x1, x2, y = factor(y))
library(ggplot2)
p2 <- ggplot(data = plotdata2, aes(x = x1, y = x2, color = y)) +
  geom_point()
print(p2)
data <- data.frame(x1, x2, y)
# NOTE(review): the `family` value was lost in the source; a binomial family is
# assumed because the response is 0/1 -- confirm.
model <- glm(y ~ ., data = data, family = binomial)
summary(model)
w <- coef(model)
# Decision boundary where the linear predictor is zero:
# w[1] + w[2] * x1 + w[3] * x2 = 0, solved for x2.
inter <- -w[1] / w[3]
slope <- -w[2] / w[3]
plotdata3 <- data.frame(cbind(x1, x2), y = factor(y))
p3 <- ggplot(data = plotdata3, aes(x = x1, y = x2, color = y)) +
  geom_point() +
  geom_abline(intercept = inter, slope = slope)
print(p3)
# NOTE(review): the `type` value was lost in the source; "response" (predicted
# probability) is assumed -- confirm.
predict(model, newdata = data.frame(x1 = 1, x2 = 3), type = "response")
# ---- Complex networks ----
# TODO(review): the path of the edge-list file was lost in the source; replace
# this placeholder with the real location.
snafile <- "sna.txt"
# NOTE(review): the reader function name was lost in the source; read.table()
# (whitespace-separated) is assumed -- switch to read.csv() if the file is
# comma-separated.
snadf <- read.table(snafile, header = FALSE, stringsAsFactors = FALSE)
head(snadf)
library(igraph)
# NOTE(review): several igraph function names were lost in the source and are
# reconstructed below from their surviving arguments, using current igraph
# names (graph_from_data_frame, page_rank, similarity, cluster_walktrap,
# layout_with_fr) -- confirm each against the intended analysis.
snaobj <- graph_from_data_frame(snadf, directed = FALSE)
class(snaobj)
vcount(snaobj)
ecount(snaobj)
# NOTE(review): the `mode` value was lost in the source; "all" is assumed.
neighbors(snaobj, 6, mode = "all")
# Centrality measures for vertex 6.
degree(snaobj, v = 6)
betweenness(snaobj, v = 6, directed = FALSE)
closeness(snaobj, vids = 6)
page_rank(snaobj, vids = 6)
similarity(snaobj, vids = c(6, 7), method = "jaccard")
# Community detection; colour every vertex by its community.
snaclass <- cluster_walktrap(snaobj, steps = 5)
cl <- snaclass$membership
V(snaobj)$color <- rainbow(max(cl))[cl]
# Enlarge and label only the vertices with betweenness of at least 1800.
V(snaobj)$bte <- betweenness(snaobj, directed = FALSE)
V(snaobj)$size <- 5
V(snaobj)[bte >= 1800]$size <- 15
V(snaobj)$label <- NA
V(snaobj)[bte >= 1800]$label <- V(snaobj)[bte >= 1800]$name
# NOTE(review): the plot argument names were lost in the source and are
# reconstructed from the values passed. No `cex` vertex attribute is set in
# this script, so V(snaobj)$cex is NULL here. One further trailing argument was
# truncated in the source and is omitted -- confirm.
plot(
  snaobj,
  layout = layout_with_fr,
  vertex.size = V(snaobj)$size,
  vertex.color = V(snaobj)$color,
  vertex.label = V(snaobj)$label,
  vertex.label.cex = V(snaobj)$cex,
  edge.color = grey(0.5)
)
# ---- Cleaning data with the caret package and predicting with a tree ----
# NOTE(review): only "(1)" survived in the source; set.seed(1) is assumed so
# that the imputation and cross-validated errors are reproducible -- confirm.
set.seed(1)
data(PimaIndiansDiabetes2, package = "mlbench")
data <- PimaIndiansDiabetes2
library(caret)
# Column 9 is used as the class label below; preprocess the other columns only.
# Step 1: centre and scale.
preProcValues <- preProcess(data[, -9], method = c("center", "scale"))
scaleddata <- predict(preProcValues, data[, -9])
# Step 2: Yeo-Johnson transformation.
preProcbox <- preProcess(scaleddata, method = c("YeoJohnson"))
boxdata <- predict(preProcbox, scaleddata)
# Step 3: impute missing values with bagged trees.
preProcimp <- preProcess(boxdata, method = "bagImpute")
procdata <- predict(preProcimp, boxdata)
procdata$class <- data[, 9]
library(rpart)
# Grow the full tree (cp = 0), then prune at the complexity value that
# minimises cross-validated error plus its standard deviation.
rpartModel <- rpart(
  class ~ .,
  data = procdata,
  control = rpart.control(cp = 0)
)
cptable <- as.data.frame(rpartModel$cptable)
cptable$errsd <- cptable$xerror + cptable$xstd
cpvalue <- cptable[which.min(cptable$errsd), "CP"]
pruneModel <- prune(rpartModel, cpvalue)
# NOTE(review): the package and function names were lost in the source
# ("library()" and "(pruneModel)"); rpart.plot is assumed -- confirm.
library(rpart.plot)
rpart.plot(pruneModel)
# In-sample confusion table and accuracy.
pre <- predict(pruneModel, procdata, type = "class")
preTable <- table(pre, procdata$class)
accuracy <- sum(diag(preTable)) / sum(preTable)
# Write data frames to comma-separated files and read them back.
# NOTE(review): the function names and the file names were lost in the source
# (only the directory 'C:/Program Files/R/' survived). write.table() /
# read.table() and the file names below are a reconstruction -- confirm, and
# consider a user-writable directory instead of Program Files.
write.table(iris, file = "C:/Program Files/R/iris.csv", sep = ",")
data <- read.table(file = "C:/Program Files/R/iris.csv", sep = ",")
write.table(procdata, file = "C:/Program Files/R/procdata.csv", sep = ",")
procdata <- read.table(file = "C:/Program Files/R/procdata.csv", sep = ",")