Dataquest学习总结[4]

 继续Intermediate Python and Pandas / Data Analysis with Pandas: Intermediate /

Guided Project: Analyzing Thanksgiving Dinner:  数据集地址:here

用到的方法: pandas.Series.value_counts()     

import pandas as pd
# Load the survey responses, decoding the file as Latin-1.
data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")
print(data.head(3))

print(data.columns)

# Count the answers, then keep only respondents who answered "Yes".
# The explicit .copy() makes `data` an independent frame, so adding columns
# to it later does not raise pandas' SettingWithCopyWarning.
print(data["Do you celebrate Thanksgiving?"].value_counts())
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"].copy()
print(len(data))

# Distribution of main dishes, then the gravy answers of the respondents
# whose main dish is "Tofurkey".
print(data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())
data_1 = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
print(data_1["Do you typically have gravy?"])

# One boolean Series per pie column: True where the answer is null.
apple_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
Pumpkin_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
Pecan_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])
# NOTE: despite its name, ate_pies is True for rows where ALL three pie
# columns are null (presumably: none of these pies was served), and False
# when at least one of them is filled in.
ate_pies = apple_isnull & Pumpkin_isnull & Pecan_isnull
print(ate_pies.value_counts())

def str_2_int(str_1):
    """Convert an age label to the integer it starts with.

    The text before the first space is taken, a trailing ``+`` is removed if
    present, and the remainder is converted with ``int`` (for example
    ``"18 - 29"`` gives ``18`` and ``"60+"`` gives ``60``).

    Returns None when ``str_1`` is null according to ``pd.isnull``.
    """
    if pd.isnull(str_1):
        return None
    leading = str_1.partition(' ')[0]
    # Open-ended labels such as "60+" carry a plus sign after the digits.
    digits = leading[:-1] if leading[-1] == '+' else leading
    return int(digits)
# Add an "int_age" column by converting every "Age" label with str_2_int.
data["int_age"]=data["Age"].apply(str_2_int)
# Show the original and converted ages side by side for index labels 5 to 20.
print(data.loc[5:20,["Age","int_age"]])

def str_2_int_2(stra):
    """Convert an income label to the integer amount it starts with.

    The first space-separated token is parsed as a dollar amount, for example
    ``"$25,000 to $49,999"`` gives ``25000`` and ``"$200,000 and up"`` gives
    ``200000``.

    Returns None when ``stra`` is null according to ``pd.isnull`` or when its
    first token is ``"Prefer"`` (as in "Prefer not to answer").
    """
    if pd.isnull(stra):
        return None
    strb = stra.split(' ')[0]
    if strb == "Prefer":
        return None
    # Drop the currency sign and every thousands separator. Slicing at fixed
    # offsets only works for amounts with exactly one comma, so it breaks on
    # values such as "$50" or "$1,000,000".
    return int(strb.lstrip('$').replace(',', ''))
    
# Derive a numeric income column from the household-income answers.
income_answers = data["How much total combined money did all members of your HOUSEHOLD earn last year?"]
data['int_come'] = income_answers.apply(str_2_int_2)
print(data["int_come"].describe())

# Travel-distance answers, split at an int_come value of 150000.
travel_column = "How far will you travel for Thanksgiving?"
data_far = data.loc[data["int_come"] <= 150000, travel_column]
print(data_far.value_counts())
data_far_1 = data.loc[data["int_come"] > 150000, travel_column]
print(data_far_1.value_counts())

# Both pivot tables share the same row/column questions and use
# pivot_table's default aggregation; only the aggregated value differs.
pivot_axes = {
    "index": "Have you ever tried to meet up with hometown friends on Thanksgiving night?",
    "columns": 'Have you ever attended a "Friendsgiving?"',
}
ave_age = data.pivot_table(values="int_age", **pivot_axes)
print(ave_age)
ave_income = data.pivot_table(values="int_come", **pivot_axes)
print(ave_income)


接下来是关于可视化部分 Intermediate Python and Pandas / Exploratory Data Visualization 

DATE        VALUE
1948-01-01  3.4
>>对于上表这样的数据,pandas会把DATE列默认读成字符串,即object类型,需要转换成datetime类型:

# Read the unemployment data and replace the DATE strings with datetimes.
unrate = pd.read_csv("unrate.csv").assign(
    DATE=lambda frame: pd.to_datetime(frame["DATE"])
)

>>关于画图

import matplotlib.pyplot as plt

# Draw onto the current figure (called here without data).
plt.plot()

# Display the figure.
plt.show()

# NOTE: the next line is a signature reminder rather than runnable code;
# `matplotlib`, `args` and `kwargs` are not defined in the visible snippet.
matplotlib.pyplot.xticks(*args, **kwargs)   # adjusts the x-axis tick labels, e.g. rotate them 90 degrees with plt.xticks(rotation=90)

plt.xlabel("Month")                                           # label the x-axis
plt.ylabel("Unemployment Rate")                               # label the y-axis
plt.title("Monthly Unemployment Trends, 1948")                # set the plot title
 >>构建figure对象,构建子图:

# Create the figure; plt.figure(figsize=(width, height)) would set its size.
fig = plt.figure()
# Two subplots stacked in a 2-row, 1-column grid.
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
# Rows 0-11 go on the first axes, rows 12-23 on the second.
for axes, rows in ((ax1, slice(0, 12)), (ax2, slice(12, 24))):
    segment = unrate[rows]
    axes.plot(segment["DATE"], segment["VALUE"])
plt.show()

>>在同一图中画多条线:

# Extract the month number so both 12-row segments share the same x values.
unrate['MONTH'] = unrate['DATE'].dt.month
fig = plt.figure(figsize=(6, 3))
# Draw each 12-row segment as its own coloured line on the same axes.
for first_row, colour in ((0, "red"), (12, "blue")):
    last_row = first_row + 12
    plt.plot(
        unrate["MONTH"][first_row:last_row],
        unrate["VALUE"][first_row:last_row],
        c=colour,
    )
plt.show()

>>还可以给每条线增加标签和指定标签的位置:

# One labelled line per 12-row segment; the label is what the legend shows.
line_specs = (
    (slice(0, 12), 'red', '1948'),
    (slice(12, 24), 'blue', '1949'),
)
for rows, colour, legend_label in line_specs:
    segment = unrate[rows]
    plt.plot(segment['MONTH'], segment['VALUE'], c=colour, label=legend_label)

# Place the legend in the upper-left corner of the plot.
plt.legend(loc='upper left')

>>画条形图(柱状图):pyplot.bar() 或 Axes.bar()

import matplotlib.pyplot as plt
from numpy import arange
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# Ratings of the first row. DataFrame.ix was removed in pandas 1.0, so select
# the rating columns and take the first row by position instead.
bar_heights = norm_reviews[num_cols].iloc[0].values
# One bar per rating column, starting at x = 0.75.
bar_positions = arange(len(num_cols)) + 0.75
fig, ax = plt.subplots()  # returns the Figure and Axes objects
# align='edge' anchors each bar's left edge at its position regardless of the
# matplotlib version (the default alignment changed to 'center' in 2.0).
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
plt.show()


>>条形图的其他操作(设置刻度位置、刻度标签、坐标轴标签和标题):

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# Ratings of the first row. DataFrame.ix was removed in pandas 1.0, so select
# the rating columns and take the first row by position instead.
bar_heights = norm_reviews[num_cols].iloc[0].values
# One bar per rating column: left edges at 0.75, 1.75, ... and ticks at 1, 2, ...
bar_positions = arange(len(num_cols)) + 0.75
tick_positions = range(1, len(num_cols) + 1)
fig, ax = plt.subplots()
# With width 0.5 and align='edge' each bar spans [k - 0.25, k + 0.25], so it
# is centred on tick k. The explicit align is needed because matplotlib 2.0
# changed the default to 'center', which would shift the bars off the ticks.
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
ax.set_xticks(tick_positions)
# Label each tick with its column name, rotated to avoid overlap.
ax.set_xticklabels(num_cols, rotation=90)
plt.xlabel("Rating Source")
plt.ylabel("Average Rating")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()


>>水平条形图:Axes.barh()

import matplotlib.pyplot as plt
from numpy import arange
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# Ratings of the first row. DataFrame.ix was removed in pandas 1.0, so select
# the rating columns and take the first row by position instead.
bar_widths = norm_reviews[num_cols].iloc[0].values
# One bar per rating column: bottom edges at 0.75, 1.75, ... and ticks at 1, 2, ...
bar_positions = arange(len(num_cols)) + 0.75
tick_positions = range(1, len(num_cols) + 1)
fig, ax = plt.subplots()
# For barh the third positional argument is the bar height (thickness).
# align='edge' keeps each bar centred on its tick regardless of the
# matplotlib version (the default alignment changed to 'center' in 2.0).
ax.barh(bar_positions, bar_widths, 0.5, align='edge')
ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
plt.xlabel("Average Rating")
plt.ylabel("Rating Source")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()


>>绘制散点图:

# Scatter the Fandango ratings (x) against the Rotten Tomatoes user ratings (y).
fig, ax = plt.subplots()
x_values = norm_reviews["Fandango_Ratingvalue"]
y_values = norm_reviews["RT_user_norm"]
ax.scatter(x_values, y_values)
# `ax` is the current axes, so labelling it directly matches plt.xlabel/ylabel.
ax.set_xlabel("Fandango")
ax.set_ylabel("Rotten Tomatoes")
plt.show()


>>多个子图

# Two stacked scatter plots of the same pair of columns with the axes swapped.
fig = plt.figure(figsize=(5, 10))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
fandango = norm_reviews["Fandango_Ratingvalue"]
rotten_tomatoes = norm_reviews["RT_user_norm"]
panels = (
    (ax1, fandango, "Fandango", rotten_tomatoes, "Rotten Tomatoes"),
    (ax2, rotten_tomatoes, "Rotten Tomatoes", fandango, "Fandango"),
)
for axes, x_values, x_name, y_values, y_name in panels:
    axes.scatter(x_values, y_values)
    axes.set_xlabel(x_name)
    axes.set_ylabel(y_name)
plt.show()


>>设置横纵坐标的范围:Axes.set_xlim()  Axes.set_ylim() 

这一部分暂告一段落,关于可视化操作还有下一步的表格显示以及Storytelling Through Data Visualization部分没有学习,先跳过学习Data Cleaning


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值