作为全球最受欢迎的体育运动,自然会吸引全世界无数球迷的目光。本文将对世界杯历史数据进行可视化分析。数据集是FIFA官方数据整理的基础数据表,本文数据集来源于天池。本文将对表WorldCupsSummary(世界杯成绩信息表)进行数据可视化。该数据集包含了1930-2018年共21届世界杯赛事汇总信息,涉及的信息可见“变量介绍”部分。
一 变量介绍
Year:举办年份
HostCountry:举办国家
Winner、Second、Third、Fourth:获奖名次。
GoalsScored:总进球数
QualifiedTeams:总参赛队伍数
MatchesPlayed:总比赛场数
Attendance:现场观众总数
HostContinent:举办国所在洲
WinnerContinent:冠军国家所在洲
二 分析问题
举办世界杯次数最多的国家
举办世界杯次数最多的洲
计算各举办国在所属洲的占比
夺冠次数最多的国家及其所在洲分布
世界杯半决赛结果
经常打入半决赛、决赛的队伍,及其夺冠概率
东道主进入半决赛、决赛、夺冠的概率
历届世界杯总进球数变化趋势
历届世界杯现场观看人数变化趋势、年平均现场观看人数
历届世界杯总参赛队伍变化趋势
历届世界杯总比赛场数变化趋势
三 数据分析
1 导入数据
WorldCupsSummary <- read.csv("WorldCupsSummary.csv",header=TRUE)
2 数据预处理
# 将Germany FR替换为Germany
WorldCupsSummary$Winner[WorldCupsSummary$Winner=="Germany FR"] <- "Germany"
WorldCupsSummary$Second[WorldCupsSummary$Second=="Germany FR"] <- "Germany"
WorldCupsSummary$Third[WorldCupsSummary$Third=="Germany FR"] <- "Germany"
WorldCupsSummary$Fourth[WorldCupsSummary$Fourth=="Germany FR"] <- "Germany"
3 数据分析
3.1 举办世界杯次数最多的国家
f_hostcountry <- data.frame(table(WorldCupsSummary$HostCountry),stringsAsFactors = F)
names(f_hostcountry)[1] <- "HostCountry"
f_hostcountry <- f_hostcountry[order(f_hostcountry$Freq),]
b1 <- WorldCupsSummary[,c(1,2)]
b1 <- merge(f_hostcountry,b1)
b2 <- b1[b1$Freq != 1,]
b3 <- data.frame(HostCountry=character(),stringsAsFactors = F) #创建空白数据框
for (i in unique(b2$HostCountry)) {
k4 <- b2[b2$HostCountry==i,c(1,2,3)]
k4$Year <- paste0(k4$Year[1],"& ",k4$Year[2])
b3 <- rbind(b3,k4[duplicated(k4),])
}
f_hostcountry <- rbind(b1[b1$Freq==1,],b3)
ggplot(f_hostcountry,aes(x=reorder(HostCountry,Freq),y=Freq,fill=HostCountry))+
geom_bar(stat="identity")+
scale_y_continuous(breaks=c(1,2))+
labs(title="举办世界杯次数最多的国家",x="举办国家",y="举办次数")+
theme(panel.grid.major.y=element_blank(),
panel.grid.minor=element_blank(),
plot.title=element_text(hjust=0.5,size=16,face="bold"))+
geom_text(aes(x=HostCountry,y=0.05,label=paste("举办时间:",Year,"年",seq=""),hjust=0))+ #hjust=0左对齐
coord_flip() #交换坐标轴
3.2 举办世界杯次数最多的洲
f_hostContinent <- data.frame(table(WorldCupsSummary$HostContinent))
names(f_hostContinent)[1] <- "HostContinent"
f_hostContinent <- mutate(f_hostContinent,Percentage=round(Freq/sum(Freq)*100,2))
layout(matrix(1:2,1,2,byrow=T),widths=c(1,1.5))
bar2 <- barplot(f_hostContinent$Freq,
col=brewer.pal(4,"Set3"),axes=F,cex.main=1.5,cex.names=0.55,
main="四大洲举办世界杯次数",xlab="四大洲",ylab="举办世界杯次数")
axis(side=2,at=seq(0,13,1),labels=seq(0,13,1),las=1)
axis(side=1,at=seq(0.7,4.3,1.2),labels=c("Africa","America","Asia","Europe"),
tick=F) #是否绘制刻度线和轴线
text(x=bar2,y=f_hostContinent$Freq-0.5,labels=f_hostContinent$Freq,cex=1,font=2)
pie(f_hostContinent$Percentage,labels=paste0(f_hostContinent$HostContinent,":",f_hostContinent$Percentage,"%"),
col=brewer.pal(4,"Set3"),cex.main=1.5,
main="四大洲举办世界杯次数占比")
3.3 计算各举办国家在所属洲的占比(America、Europe洲)
f_host <- xtabs(~WorldCupsSummary$HostContinent+WorldCupsSummary$HostCountry)
f_host <- round((f_host/rowSums(f_host))*100,2)
f_host <- data.frame(f_host)
names(f_host)[1] <- "HostContinent"
names(f_host)[2] <- "HostCountry"
m1 <- subset(f_host,HostContinent=="America" & Freq != 0.00,select=c(HostCountry,Freq))
blank_theme <- theme_minimal()+ #删除背景色
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold",hjust=0.5))
pie1 <- ggplot(m1,aes(x="",y=Freq,fill=HostCountry))+
geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+
coord_polar(theta="y", start=0)+
scale_fill_brewer(palette="Set3")+
blank_theme+
theme(axis.text.x=element_blank(),
legend.position="none") +
geom_text(aes(x=1.2,label=paste0(m1$HostCountry,"\n",m1[,2],"%")),
position=position_stack(reverse =T,vjust=0.5),size=5)+
labs(title="世界杯在America洲的具体分布",plot.title=element_text(hjust=0.5))
m2 <- subset(f_host,HostContinent=="Europe" & Freq != 0.00,select=c(HostCountry,Freq))
pie2 <- ggplot(m2,aes(x="",y=Freq,fill=HostCountry))+
geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+
coord_polar(theta="y", start=0)+
scale_fill_brewer(palette="Set3")+
blank_theme+
theme(axis.text.x=element_blank(),
legend.position="none") +
geom_text(aes(x=1.2,label=paste0(m2$HostCountry,"\n",m2[,2],"%")),
position=position_stack(reverse =T,vjust=0.5),size=5)+
labs(title="世界杯在Europe洲的具体分布",plot.title=element_text(hjust=0.5))
grid.arrange(pie1,pie2,ncol=2)
3.4 夺冠次数最多的国家及其所在洲分布
attach(WorldCupsSummary)
win_con <- data.frame(xtabs(~WinnerContinent+Winner)) #根据洲、冠军队伍汇总
win_con <- win_con[win_con$Freq!=0,]
win_con$WinnerContinent <- factor(win_con$WinnerContinent)
sum_wincon <- data.frame(table(WinnerContinent)) #冠军所在洲的汇总
sum_wincon$p_freq <- c(sum_wincon$Freq[1]/(sum_wincon$Freq[1]+sum_wincon$Freq[2]),
sum_wincon$Freq[2]/(sum_wincon$Freq[1]+sum_wincon$Freq[2]))
bar3 <- ggplot(sum_wincon,aes(x="",y=Freq,fill=WinnerContinent))+
geom_bar(stat="identity",width=1,position=position_stack(reverse =T))+
coord_polar(theta="y",start=0)+
geom_text(aes(x=1,label=paste(WinnerContinent,":",round(p_freq*100,2),"%",seq="")),
position=position_stack(reverse =T,vjust=0.5),size=5)+
blank_theme+
theme(axis.text.x=element_blank(),
legend.position="none")
bar4 <- ggplot(win_con,aes(x=Winner,y=Freq,fill=WinnerContinent))+
geom_bar(stat="identity")+
labs(title="夺冠队伍的具体分布",x="",y="夺冠次数")+
theme(axis.text.x=element_text(angle=330,vjus=1,size=10),
axis.text.y=element_text(size=10),
plot.title=element_text(hjust=0.5,face="bold",size=13),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.background = element_blank(), #去掉背景颜色
axis.line.y=element_line(colour = "black")) #显示坐标轴线
grid.arrange(bar3,bar4,ncol=2)
detach(WorldCupsSummary)
3.5 世界杯半决赛结果
freq_all <- apply(WorldCupsSummary[,3:6],2,table) #生成第一-四名的频数列表
result_freq <- data.frame(Country=character(),stringsAsFactors = FALSE) #创建空白数据框
for (i in names(freq_all)) {
df <- data.frame(freq_all[[i]])
names(df)[1] <- c("Country")
names(df)[2] <- i
result_freq <- full_join(result_freq,df,by="Country") #将生成结果填充在数据框result_freq中
} #生成第一至四名的频数数据框
result_freq[is.na(result_freq)] <- 0 #填充整个数据框中存在缺失的单元格
3.5.1 冠亚军分布
win_second <- subset(result_freq,Winner!=0 | Second!=0,select=c(1,2,3)) #提取仅含冠亚军的数据
win_second <- gather(win_second,key="rank",value="count",-Country) #宽数据转换为长数据
win_second <- arrange(win_second,-count) #根据count列降序
ggplot(win_second,aes(x=reorder(Country,-count),y=count,fill=rank))+
geom_bar(stat="identity",position="dodge")+
labs(title="21届世界杯比赛中获得冠亚军的国家次数统计",x="国家",y="次数")+
theme(axis.text.x = element_text(angle=45),
plot.title=element_text(hjust=0.5,vjust=0.5,size=18,face="bold"))
3.5.2 第三四名分布
third_fourth <- subset(result_freq,Third!=0 | Fourth!=0,select=c(1,4,5))
third_fourth <- gather(third_fourth,key="rank",value="count",-Country)
third_fourth <- arrange(third_fourth,-count)
ggplot(third_fourth,aes(x=reorder(Country,count),y=count,fill=rank))+
geom_bar(stat="identity",position="dodge")+
labs(title="21届世界杯比赛中获得第三四名的国家次数统计",x="国家",y="次数")+
coord_flip()+
theme(plot.title=element_text(hjust=0.5,face="bold"))
3.6 经常打入半决赛、决赛的队伍,及其夺冠概率
3.6.1 进入半决赛的次数
result_freq$semi <- apply(result_freq[2:5],1,sum)
result_freq$final <- apply(result_freq[2:3],1,sum)
m3 <- result_freq[result_freq$semi!=0,]
ggplot(m3,aes(x=Country,y=semi,fill=semi))+
geom_bar(stat="identity")+
geom_text(aes(y=semi+0.25,label=semi,size=4))+
labs(title="进入半决赛的次数",x="国家",y="次数")+
theme(plot.title=element_text(hjust=0.5,size=16,face="bold"),
axis.text.x=element_text(size=13,angle=45,vjust=0.8),
axis.text.y=element_text(size=13),
legend.position = "none")
3.6.2 进入决赛的次数
m4 <- result_freq[result_freq$final!=0,]
ggplot(m4,aes(x=Country,y=final,fill=final))+
geom_bar(stat="identity")+
geom_text(aes(y=final+0.25,label=final,size=4))+
labs(title="进入决赛的次数",x="国家",y="次数")+
theme(plot.title=element_text(hjust=0.5,size=16,face="bold"),
axis.text.x=element_text(size=13,angle=45,vjust=0.8),
axis.text.y=element_text(size=13),
legend.position = "none")
3.6.3 进入决赛后夺冠的概率
result_freq$p_win <- round((result_freq$Winner/result_freq$final)*100,2)
result_freq_pwin <- result_freq[!is.na(result_freq$p_win),]
ggplot(result_freq_pwin,aes(x=Country,y=p_win))+
geom_bar(stat="identity",fill="#6595A3")+
geom_text(aes(y=p_win+2),label=paste(result_freq_pwin$p_win,"%",seq=""),size=4)+
labs(title="进入决赛之后夺冠概率",y="夺冠概率(单位:“%)")+
theme(plot.title=element_text(hjust=0.5,size=13,face="bold"),
axis.text.y=element_text(size=13),
axis.text.x=element_text(size=13,angle=45))
3.7 东道主进入半决赛、决赛、夺冠的概率
3.7.1 东道主进入半决赛的概率
p_hostcountry <- WorldCupsSummary[,2:6]
p_hostcountry[1,]
for (i in 1:length(p_hostcountry$HostCountry)) {
if (p_hostcountry$HostCountry[i] %in% p_hostcountry[i,c(2,3,4,5)]) {
p_hostcountry$semi[i] <- 1
} else {p_hostcountry$semi[i] <- 0}
}
layout(matrix(1:2,1,2,byrow=T))
pie(table(p_hostcountry$semi),col=c("#E64B35CC","#4DBBD5CC"),
labels=c("未进入半决赛:42.86%","进入半决赛:57.14%"),
main="东道主进入四强的概率")
barplot(table(p_hostcountry$semi),col=c("#E64B35CC","#4DBBD5CC"),axes = F,
ylab="次数")
axis(side=2,at=seq(0,12,1),las=2)
3.7.2 东道主进入决赛的概率
for (i in 1:length(p_hostcountry$HostCountry)) {
if (p_hostcountry$HostCountry[i] %in% p_hostcountry[i,c(2,3)]) {
p_hostcountry$final[i] <- 1
} else {p_hostcountry$final[i] <- 0}
}
layout(matrix(1:2,1,2,byrow=T))
pie(table(p_hostcountry$final),col=c("#E64B35CC","#4DBBD5CC"),
labels=c("未进入决赛:61.9%","进入决赛:38.1%"),
main="东道主进入决赛的概率")
barplot(table(p_hostcountry$final),col=c("#E64B35CC","#4DBBD5CC"),axes = F,
ylab="次数")
axis(side=2,at=seq(0,13,1),las=2)
3.7.3 东道主夺冠的概率
for (i in 1:length(p_hostcountry$HostCountry)) {
if (p_hostcountry$HostCountry[i]==p_hostcountry$Winner[i]) {
p_hostcountry$win[i] <- 1
} else {p_hostcountry$win[i] <- 0}
}
layout(matrix(1:2,1,2,byrow=T))
pie(table(p_hostcountry$win),col=c("#E64B35CC","#4DBBD5CC"),
labels=c("未夺冠:71.43%","夺冠:28.57%"),
main="东道主夺冠的概率")
barplot(table(p_hostcountry$win),col=c("#E64B35CC","#4DBBD5CC"),axes = F,
ylab="次数")
axis(side=2,at=seq(0,15,1),las=2)
3.8 历届世界杯总进球数变化趋势
t1 <- c(1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,1978,1982,1986,1990,
1994,1998,2002,2006,2010,2014,2018)
plot(WorldCupsSummary$Year,WorldCupsSummary$GoalsScored,type="b",
pch=16,lty=1,lwd=2,xaxt="n",yaxt="n",
main="1930-2018年总进球数变化情况",xlab="年份",ylab="总进球数")
axis(side=1,at=t1,labels=t1)
axis(side=2,at=seq(70,180,10),las=2)
3.9 历届世界杯现场观看人数变化趋势、年平均现场观看人数
#历届世界杯现场观看人数变化趋势
attendence_avg_all <- subset(WorldCupsSummary,select=c(Year,MatchesPlayed,Attendance))
attendence_avg_all <- mutate(attendence_avg_all,avg_attendance=Attendance/MatchesPlayed)
plot(attendence_avg_all$Year,attendence_avg_all$Attendance,type="b",
pch=19,lty=4,lwd=2,col="blue",xaxt="n",yaxt="n",
main="1930-2018年现场观众变化情况",xlab="年份",ylab="现场观众总人数(单位:万)")
axis(side=1,at=t1,labels=t1)
axis(side=2,at=seq(360000,3600000,360000),labels=seq(3.6,36,3.6),las=2)
#年平均现场观看人数
plot(attendence_avg_all$Year,attendence_avg_all$avg_attendance,type="b",
pch=16,lty=2,lwd=2,col="red",xaxt="n",yaxt="n",ylab="",xlab="")
axis(side=4,at=seq(20000,75000,5000),labels=seq(2,7.5,0.5),las=2)
mtext(side=4,line=2.5,"年均现场观看人数(单位:万")
legend("topleft",inset=0.01,title="现场观众人数",c("总人数","年平均人数"),lty=c(4,2),
pch=c(19,16),col=c("blue","red"))
3.10 历届世界杯总参赛队伍变化趋势
plot(WorldCupsSummary$Year,WorldCupsSummary$QualifiedTeams,type="b",
pch=17,cex=1.5,lty=2,lwd=2,xaxt="n",yaxt="n",
main="1930-2018年总参赛队伍数变化情况",xlab="年份",ylab="总参赛队伍数")
axis(side=1,at=t1,labels=t1)
axis(side=2,at=seq(13,35,1),las=2)
3.11 历届世界杯总比赛场数变化趋势
plot(WorldCupsSummary$Year,WorldCupsSummary$MatchesPlayed,type="b",
pch=18,lty=3,lwd=2,xaxt="n",yaxt="n",
main="1930-2018年总比赛场数变化情况",xlab="年份",ylab="总比赛场数")
axis(side=1,at=t1,labels=t1)
axis(side=2,at=seq(17,67,5),las=2)