11. Seaborn数据可视化¶
11.1. Seaborn和Matplotlib的对比¶
# 准备环境
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Prepare the data: cumulative sums of seeded Gaussian noise, six columns
rng = np.random.RandomState(0)
x = np.linspace(0, 10, 500)
steps = rng.randn(500, 6)
y = steps.cumsum(axis=0)
# Plot every column against x and label the lines A..F
plt.plot(x, y)
plt.legend("ABCDEF", ncol=2, loc='upper left')
<matplotlib.legend.Legend at 0x7f4ff8e5f9e8>
# Draw the same figure with seaborn styling.
import seaborn as sns
# Since seaborn 0.8, importing seaborn no longer changes matplotlib's
# defaults; the seaborn theme has to be activated explicitly, otherwise this
# plot looks identical to the plain matplotlib one above.
sns.set()
plt.plot(x, y)
plt.legend("ABCDEF", ncol=2, loc='upper left')
<matplotlib.legend.Legend at 0x7f50147832b0>
11.2. Seaborn图形介绍¶
11.2.1. 频次直方图,KDE和密度图¶
# Histograms: draw 2000 samples from a correlated 2-D normal distribution
# and overlay one semi-transparent histogram per column.
data = np.random.multivariate_normal([0, 0], [[5, 2], [2, 2]], size=2000)
data = pd.DataFrame(data, columns=['X', 'Y'])
for col in 'XY':
    plt.hist(data[col], alpha=0.5)
# sns.kdeplot draws a smooth kernel density estimate (KDE) of each
# variable's distribution instead of a binned histogram.
for col in 'XY':
    sns.kdeplot(data[col], shade=True)
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
# distplot可以让频次直方图和KDE结合起来
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<matplotlib.axes._subplots.AxesSubplot at 0x7f4ff6c17da0>
# 如果向kdeplot输入的是二维数据,则可以获得二维数据的可视化
/sw/ana/lib/python3.7/site-packages/seaborn/distributions.py:679: UserWarning: Passing a 2D dataset for a bivariate plot is deprecated in favor of kdeplot(x, y), and it will cause an error in future versions. Please update your code.
warnings.warn(warn_msg, UserWarning)
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<matplotlib.axes._subplots.AxesSubplot at 0x7f4ff6b99f28>
# jointplot shows the joint distribution of two variables together with
# the marginal distribution of each variable on its own.
# x / y / data are passed by keyword: newer seaborn releases no longer accept
# them positionally, and the keyword form also works on older releases.
with sns.axes_style('white'):
    sns.jointplot(x='X', y='Y', data=data, kind='kde')
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
# Passing kind='hex' to jointplot replaces the central plot with hexagonal
# bins instead of a conventional histogram.
# x / y / data are passed by keyword so the call works on both old and new
# seaborn releases.
with sns.axes_style("white"):
    sns.jointplot(x='X', y='Y', data=data, kind='hex')
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
11.2.2. 矩阵图¶
对多维数据集进行可视化时,需要用到矩阵图(pair plot)来表示变量中任意两个变量的关系,探索多维数据不同维度的相关性。
import seaborn as sns
# Load the iris dataset.
# It relates sepal and petal measurements to the iris species.
# The data is downloaded from GitHub, so it may take a few attempts to succeed.
iris = sns.load_dataset('iris')
sepal_length | sepal_width | petal_length | petal_width | species | |
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Pair plot of every variable against every other, coloured by species.
# `height` is the current name of the former `size` parameter (per-facet
# height in inches); using it avoids the rename warning.
sns.pairplot(iris, hue='species', height=2.5)
/sw/ana/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
warnings.warn(msg, UserWarning)
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<seaborn.axisgrid.PairGrid at 0x7f385d5cb748>
11.2.3. 分面频次直方图¶
# Load the data.
# The tips dataset records restaurant bills and tips together with
# attributes of the party (sex, smoker, day, time, size).
tips = sns.load_dataset('tips')
total_bill | tip | sex | smoker | day | time | size | |
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Express each tip as a percentage of the total bill
tips['tip_pct'] = 100 * tips['tip'] / tips['total_bill']
# One histogram panel per (sex, time) combination, all sharing the same bins
pct_bins = np.linspace(0, 40, 15)
grid = sns.FacetGrid(data=tips, row='sex', col='time', margin_titles=True)
grid.map(plt.hist, 'tip_pct', bins=pct_bins)
<seaborn.axisgrid.FacetGrid at 0x7f4ff5bccb38>
11.2.4. 因子图¶
因子图(Factor Plot)也是对数据子集进行可视化的方法,可以用来观察一个参数在另一个参数间隔中的分布情况。
# Box plot of the total bill per day, split by sex.
# `factorplot` has been renamed to `catplot`; `kind` is given explicitly, so
# the change of default kind between the two does not matter here.
with sns.axes_style(style='ticks'):
    g = sns.catplot(x='day', y='total_bill', hue='sex', data=tips, kind='box')
    g.set_axis_labels("Day", 'Total_bill')
/sw/ana/lib/python3.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
11.2.5. 联合分布图¶
# Joint distribution of bill and tip, drawn with hexagonal bins.
# x / y are passed by keyword so the call works on both old and new seaborn.
with sns.axes_style("white"):
    sns.jointplot(x='total_bill', y='tip', data=tips, kind='hex')
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
# Same joint plot with kind='reg', which adds a regression fit.
# x / y are passed by keyword so the call works on both old and new seaborn.
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
/sw/ana/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<seaborn.axisgrid.JointGrid at 0x7f4ff2f6d5f8>
11.2.6. 条形图¶
# Visualise the planet observation data
planets = sns.load_dataset('planets')
method | number | orbital_period | mass | distance | year | |
0 | Radial Velocity | 1 | 269.300 | 7.10 | 77.40 | 2006 |
1 | Radial Velocity | 1 | 874.774 | 2.21 | 56.95 | 2008 |
2 | Radial Velocity | 1 | 763.000 | 2.60 | 19.84 | 2011 |
3 | Radial Velocity | 1 | 326.030 | 19.40 | 110.62 | 2007 |
4 | Radial Velocity | 1 | 516.220 | 10.50 | 119.47 | 2009 |
# Bar chart counting the rows of `planets` for each year.
# `factorplot` has been renamed to `catplot`; the arguments are unchanged.
with sns.axes_style("white"):
    g = sns.catplot(x='year', data=planets, aspect=2,
                    kind='count', color='steelblue')
/sw/ana/lib/python3.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
# Count of rows per year for 2001-2014, split by the `method` column.
# `factorplot` has been renamed to `catplot`; the arguments are unchanged.
with sns.axes_style("white"):
    g = sns.catplot(x='year', data=planets, aspect=4.0,
                    kind='count', hue='method', order=range(2001, 2015))
    g.set_ylabels('Number of Planets Discovered')
/sw/ana/lib/python3.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.