R——世界杯数据可视化

R——世界杯数据可视化

作为全球最受欢迎的体育运动,自然会吸引全世界无数球迷的目光。本文将对世界杯历史数据进行可视化分析。数据集是FIFA官方数据整理的基础数据表,本文数据集来源于天池。本文将对表WorldCupsSummary(世界杯成绩信息表)进行数据可视化。该数据集包含了1930-2018年共21届世界杯赛事汇总信息,涉及的信息可见“变量介绍”部分。

一 变量介绍

Year:举办年份

HostCountry:举办国家

Winner、Second、Third、Fourth:获奖名次。

GoalsScored:总进球数

QualifiedTeams:总参赛队伍数

MatchesPlayed:总比赛场数

Attendance:现场观众总数

HostContinent:举办国所在洲

WinnerContinent:冠军国家所在洲

二 分析问题

举办世界杯次数最多的国家

举办世界杯次数最多的洲

计算各举办国在所属洲的占比

夺冠次数最多的国家及其所在洲分布

世界杯半决赛结果

经常打入半决赛、决赛的队伍,及其夺冠概率

东道主进入半决赛、决赛、夺冠的概率

历届世界杯总进球数变化趋势

历届世界杯现场观看人数变化趋势、年平均现场观看人数

历届世界杯总参赛队伍变化趋势

历届世界杯总比赛场数变化趋势

三 数据分析

1 导入数据

WorldCupsSummary <- read.csv("WorldCupsSummary.csv",header=TRUE)

2 数据预处理

# 将Germany FR替换为Germany

WorldCupsSummary$Winner[WorldCupsSummary$Winner=="Germany FR"] <- "Germany"

WorldCupsSummary$Second[WorldCupsSummary$Second=="Germany FR"] <- "Germany"

WorldCupsSummary$Third[WorldCupsSummary$Third=="Germany FR"] <- "Germany"

WorldCupsSummary$Fourth[WorldCupsSummary$Fourth=="Germany FR"] <- "Germany"

3 数据分析

3.1 举办世界杯次数最多的国家

f_hostcountry <- data.frame(table(WorldCupsSummary$HostCountry),stringsAsFactors = F)

names(f_hostcountry)[1] <- "HostCountry"

f_hostcountry <- f_hostcountry[order(f_hostcountry$Freq),]

b1 <- WorldCupsSummary[,c(1,2)]

b1 <- merge(f_hostcountry,b1)

b2 <- b1[b1$Freq != 1,]

b3 <- data.frame(HostCountry=character(),stringsAsFactors = F) #创建空白数据框

for (i in unique(b2$HostCountry)) {

k4 <- b2[b2$HostCountry==i,c(1,2,3)]

k4$Year <- paste0(k4$Year[1],"& ",k4$Year[2])

b3 <- rbind(b3,k4[duplicated(k4),])

}

f_hostcountry <- rbind(b1[b1$Freq==1,],b3)

ggplot(f_hostcountry,aes(x=reorder(HostCountry,Freq),y=Freq,fill=HostCountry))+

geom_bar(stat="identity")+

scale_y_continuous(breaks=c(1,2))+

labs(title="举办世界杯次数最多的国家",x="举办国家",y="举办次数")+

theme(panel.grid.major.y=element_blank(),

panel.grid.minor=element_blank(),

plot.title=element_text(hjust=0.5,size=16,face="bold"))+

geom_text(aes(x=HostCountry,y=0.05,label=paste("举办时间:",Year,"年",seq=""),hjust=0))+ #hjust=0左对齐

coord_flip() #交换坐标轴

3.2 举办世界杯次数最多的洲

f_hostContinent <- data.frame(table(WorldCupsSummary$HostContinent))

names(f_hostContinent)[1] <- "HostContinent"

f_hostContinent <- mutate(f_hostContinent,Percentage=round(Freq/sum(Freq)*100,2))

layout(matrix(1:2,1,2,byrow=T),widths=c(1,1.5))

bar2 <- barplot(f_hostContinent$Freq,

col=brewer.pal(4,"Set3"),axes=F,cex.main=1.5,cex.names=0.55,

main="四大洲举办世界杯次数",xlab="四大洲",ylab="举办世界杯次数")

axis(side=2,at=seq(0,13,1),labels=seq(0,13,1),las=1)

axis(side=1,at=seq(0.7,4.3,1.2),labels=c("Africa","America","Asia","Europe"),

tick=F) #是否绘制刻度线和轴线

text(x=bar2,y=f_hostContinent$Freq-0.5,labels=f_hostContinent$Freq,cex=1,font=2)

pie(f_hostContinent$Percentage,labels=paste0(f_hostContinent$HostContinent,":",f_hostContinent$Percentage,"%"),

col=brewer.pal(4,"Set3"),cex.main=1.5,

main="四大洲举办世界杯次数占比")

3.3 计算各举办国家在所属洲的占比(America、Europe洲)

f_host <- xtabs(~WorldCupsSummary$HostContinent+WorldCupsSummary$HostCountry)

f_host <- round((f_host/rowSums(f_host))*100,2)

f_host <- data.frame(f_host)

names(f_host)[1] <- "HostContinent"

names(f_host)[2] <- "HostCountry"

m1 <- subset(f_host,HostContinent=="America" & Freq != 0.00,select=c(HostCountry,Freq))

blank_theme <- theme_minimal()+ #删除背景色

theme(axis.title.x = element_blank(),

axis.title.y = element_blank(),

panel.border = element_blank(),

panel.grid=element_blank(),

axis.ticks = element_blank(),

plot.title=element_text(size=14, face="bold",hjust=0.5))

pie1 <- ggplot(m1,aes(x="",y=Freq,fill=HostCountry))+

geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+

coord_polar(theta="y", start=0)+

scale_fill_brewer(palette="Set3")+

blank_theme+

theme(axis.text.x=element_blank(),

legend.position="none") +

geom_text(aes(x=1.2,label=paste0(m1$HostCountry,"\n",m1[,2],"%")),

position=position_stack(reverse =T,vjust=0.5),size=5)+

labs(title="世界杯在America洲的具体分布",plot.title=element_text(hjust=0.5))

m2 <- subset(f_host,HostContinent=="Europe" & Freq != 0.00,select=c(HostCountry,Freq))

pie2 <- ggplot(m2,aes(x="",y=Freq,fill=HostCountry))+

geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+

coord_polar(theta="y", start=0)+

scale_fill_brewer(palette="Set3")+

blank_theme+

theme(axis.text.x=element_blank(),

legend.position="none") +

geom_text(aes(x=1.2,label=paste0(m2$HostCountry,"\n",m2[,2],"%")),

position=position_stack(reverse =T,vjust=0.5),size=5)+

labs(title="世界杯在Europe洲的具体分布",plot.title=element_text(hjust=0.5))

grid.arrange(pie1,pie2,ncol=2)

3.4 夺冠次数最多的国家及其所在洲分布

attach(WorldCupsSummary)

win_con <- data.frame(xtabs(~WinnerContinent+Winner)) #根据洲、冠军队伍汇总

win_con <- win_con[win_con$Freq!=0,]

win_con$WinnerContinent <- factor(win_con$WinnerContinent)

sum_wincon <- data.frame(table(WinnerContinent)) #冠军所在洲的汇总

sum_wincon$p_freq <- c(sum_wincon$Freq[1]/(sum_wincon$Freq[1]+sum_wincon$Freq[2]),

sum_wincon$Freq[2]/(sum_wincon$Freq[1]+sum_wincon$Freq[2]))

bar3 <- ggplot(sum_wincon,aes(x="",y=Freq,fill=WinnerContinent))+

geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+

coord_polar(theta="y",start=0)+

geom_text(aes(x=1,label=paste(WinnerContinent,":",round(p_freq*100,2),"%",seq="")),

position=position_stack(reverse =T,vjust=0.5),size=5)+

blank_theme+

theme(axis.text.x=element_blank(),

legend.position="none")

bar4 <- ggplot(win_con,aes(x=Winner,y=Freq,fill=WinnerContinent))+

geom_bar(stat="identity")+

labs(title="夺冠队伍的具体分布",x="",y="夺冠次数")+

theme(axis.text.x=element_text(angle=330,vjus=1,size=10),

axis.text.y=element_text(size=10),

plot.title=element_text(hjust=0.5,face="bold",size=13),

panel.grid.major=element_blank(),

panel.grid.minor=element_blank(),

panel.background = element_blank(), #去掉背景颜色

axis.line.y=element_line(colour = "black")) #显示坐标轴线

grid.arrange(bar3,bar4,ncol=2)

detach(WorldCupsSummary)

3.5 世界杯半决赛结果

freq_all <- apply(WorldCupsSummary[,3:6],2,table) #生成第一-四名的频数列表

result_freq <- data.frame(Country=character(),stringsAsFactors = FALSE) #创建空白数据框

for (i in names(freq_all)) {

df <- data.frame(freq_all[[i]])

names(df)[1] <- c("Country")

names(df)[2] <- i

result_freq <- full_join(result_freq,df,by="Country") #将生成结果填充在数据框result_freq中

} #生成第一至四名的频数数据框

result_freq[is.na(result_freq)] <- 0 #填充整个数据框中存在缺失的单元格

3.5.1 冠亚军分布

win_second <- subset(result_freq,Winner!=0 | Second!=0,select=c(1,2,3)) #提取仅含冠亚军的数据

win_second <- gather(win_second,key="rank",value="count",-Country) #宽数据转换为长数据

win_second <- arrange(win_second,-count) #根据count列降序

ggplot(win_second,aes(x=reorder(Country,-count),y=count,fill=rank))+

geom_bar(stat="identity",position="dodge")+

labs(title="21届世界杯比赛中获得冠亚军的国家次数统计",x="国家",y="次数")+

theme(axis.text.x = element_text(angle=45),

plot.title=element_text(hjust=0.5,vjust=0.5,size=18,face="bold"))

3.5.2 第三四名分布

third_fourth <- subset(result_freq,Third!=0 | Fourth!=0,select=c(1,4,5))

third_fourth <- gather(third_fourth,key="rank",value="count",-Country)

third_fourth <- arrange(third_fourth,-count)

ggplot(third_fourth,aes(x=reorder(Country,count),y=count,fill=rank))+

geom_bar(stat="identity",position="dodge")+

labs(title="21届世界杯比赛中获得第三四名的国家次数统计",x="国家",y="次数")+

coord_flip()+

theme(plot.title=element_text(hjust=0.5,face="bold"))

3.6 经常打入半决赛、决赛的队伍,及其夺冠概率

3.6.1 进入半决赛的次数

result_freq$semi <- apply(result_freq[2:5],1,sum)

result_freq$final <- apply(result_freq[2:3],1,sum)

m3 <- result_freq[result_freq$semi!=0,]

ggplot(m3,aes(x=Country,y=semi,fill=semi))+

geom_bar(stat="identity")+

geom_text(aes(y=semi+0.25,label=semi,size=4))+

labs(title="进入半决赛的次数",x="国家",y="次数")+

theme(plot.title=element_text(hjust=0.5,size=16,face="bold"),

axis.text.x=element_text(size=13,angle=45,vjust=0.8),

axis.text.y=element_text(size=13),

legend.position = "none")

3.6.2 进入决赛的次数

m4 <- result_freq[result_freq$final!=0,]

ggplot(m4,aes(x=Country,y=final,fill=final))+

geom_bar(stat="identity")+

geom_text(aes(y=final+0.25,label=final,size=4))+

labs(title="进入决赛的次数",x="国家",y="次数")+

theme(plot.title=element_text(hjust=0.5,size=16,face="bold"),

axis.text.x=element_text(size=13,angle=45,vjust=0.8),

axis.text.y=element_text(size=13),

legend.position = "none")

3.6.3 进入决赛后夺冠的概率

result_freq$p_win <- round((result_freq$Winner/result_freq$final)*100,2)

result_freq_pwin <- result_freq[!is.na(result_freq$p_win),]

ggplot(result_freq_pwin,aes(x=Country,y=p_win))+

geom_bar(stat="identity",fill="#6595A3")+

geom_text(aes(y=p_win+2),label=paste(result_freq_pwin$p_win,"%",seq=""),size=4)+

labs(title="进入决赛之后夺冠概率",y="夺冠概率(单位:“%)")+

theme(plot.title=element_text(hjust=0.5,size=13,face="bold"),

axis.text.y=element_text(size=13),

axis.text.x=element_text(size=13,angle=45))

3.7 东道主进入半决赛、决赛、夺冠的概率

3.7.1 东道主进入半决赛的概率

p_hostcountry <- WorldCupsSummary[,2:6]

p_hostcountry[1,]

for (i in 1:length(p_hostcountry$HostCountry)) {

if (p_hostcountry$HostCountry[i] %in% p_hostcountry[i,c(2,3,4,5)]) {

p_hostcountry$semi[i] <- 1

} else {p_hostcountry$semi[i] <- 0}

}

layout(matrix(1:2,1,2,byrow=T))

pie(table(p_hostcountry$semi),col=c("#E64B35CC","#4DBBD5CC"),

labels=c("未进入半决赛:42.86%","进入半决赛:57.14%"),

main="东道主进入四强的概率")

barplot(table(p_hostcountry$semi),col=c("#E64B35CC","#4DBBD5CC"),axes = F,

ylab="次数")

axis(side=2,at=seq(0,12,1),las=2)

3.7.2 东道主进入决赛的概率

for (i in 1:length(p_hostcountry$HostCountry)) {

if (p_hostcountry$HostCountry[i] %in% p_hostcountry[i,c(2,3)]) {

p_hostcountry$final[i] <- 1

} else {p_hostcountry$final[i] <- 0}

}

layout(matrix(1:2,1,2,byrow=T))

pie(table(p_hostcountry$final),col=c("#E64B35CC","#4DBBD5CC"),

labels=c("未进入决赛:61.9%","进入决赛:38.1%"),

main="东道主进入决赛的概率")

barplot(table(p_hostcountry$final),col=c("#E64B35CC","#4DBBD5CC"),axes = F,

ylab="次数")

axis(side=2,at=seq(0,13,1),las=2)

3.7.3 东道主夺冠的概率

for (i in 1:length(p_hostcountry$HostCountry)) {

if (p_hostcountry$HostCountry[i]==p_hostcountry$Winner[i]) {

p_hostcountry$win[i] <- 1

} else {p_hostcountry$win[i] <- 0}

}

layout(matrix(1:2,1,2,byrow=T))

pie(table(p_hostcountry$win),col=c("#E64B35CC","#4DBBD5CC"),

labels=c("未夺冠:71.43%","夺冠:28.57%"),

main="东道主夺冠的概率")

barplot(table(p_hostcountry$win),col=c("#E64B35CC","#4DBBD5CC"),axes = F,

ylab="次数")

axis(side=2,at=seq(0,15,1),las=2)

3.8 历届世界杯总进球数变化趋势

t1 <- c(1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,1978,1982,1986,1990,

1994,1998,2002,2006,2010,2014,2018)

plot(WorldCupsSummary$Year,WorldCupsSummary$GoalsScored,type="b",

pch=16,lty=1,lwd=2,xaxt="n",yaxt="n",

main="1930-2018年总进球数变化情况",xlab="年份",ylab="总进球数")

axis(side=1,at=t1,labels=t1)

axis(side=2,at=seq(70,180,10),las=2)

3.9 历届世界杯现场观看人数变化趋势、年平均现场观看人数

#历届世界杯现场观看人数变化趋势

attendence_avg_all <- subset(WorldCupsSummary,select=c(Year,MatchesPlayed,Attendance))

attendence_avg_all <- mutate(attendence_avg_all,avg_attendance=Attendance/MatchesPlayed)

plot(attendence_avg_all$Year,attendence_avg_all$Attendance,type="b",

pch=19,lty=4,lwd=2,col="blue",xaxt="n",yaxt="n",

main="1930-2018年现场观众变化情况",xlab="年份",ylab="现场观众总人数(单位:万)")

axis(side=1,at=t1,labels=t1)

axis(side=2,at=seq(360000,3600000,360000),labels=seq(3.6,36,3.6),las=2)

#年平均现场观看人数

plot(attendence_avg_all$Year,attendence_avg_all$avg_attendance,type="b",

pch=16,lty=2,lwd=2,col="red",xaxt="n",yaxt="n",ylab="",xlab="")

axis(side=4,at=seq(20000,75000,5000),labels=seq(2,7.5,0.5),las=2)

mtext(side=4,line=2.5,"年均现场观看人数(单位:万")

legend("topleft",inset=0.01,title="现场观众人数",c("总人数","年平均人数"),lty=c(4,2),

pch=c(19,16),col=c("blue","red"))

3.10 历届世界杯总参赛队伍变化趋势

plot(WorldCupsSummary$Year,WorldCupsSummary$QualifiedTeams,type="b",

pch=17,cex=1.5,lty=2,lwd=2,xaxt="n",yaxt="n",

main="1930-2018年总参赛队伍数变化情况",xlab="年份",ylab="总参赛队伍数")

axis(side=1,at=t1,labels=t1)

axis(side=2,at=seq(13,35,1),las=2)

3.11 历届世界杯总比赛场数变化趋势

plot(WorldCupsSummary$Year,WorldCupsSummary$MatchesPlayed,type="b",

pch=18,lty=3,lwd=2,xaxt="n",yaxt="n",

main="1930-2018年总比赛场数变化情况",xlab="年份",ylab="总比赛场数")

axis(side=1,at=t1,labels=t1)

axis(side=2,at=seq(17,67,5),las=2)

相关推荐

为什么手机QQ网络连接失败
中国的365体育投注

为什么手机QQ网络连接失败

06-27 👁️ 8429
QQ飞车手游炫光怎么获得
假的网站365怎么看

QQ飞车手游炫光怎么获得

06-27 👁️ 7259
汐玥名字的含义寓意 汐玥名字的意思五行
中国的365体育投注

汐玥名字的含义寓意 汐玥名字的意思五行

06-27 👁️ 5614