Bokeh & Seaborn(Vaccinating)

Bokeh & Seaborn(Vaccinating)#

Main Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html

用來做範例的這個資料是COVID疫情期間的各國疫苗接種資料。資料包含不同國家在不同日期所上傳的資料。要注意的是，這份資料的空值相當的多，有看得出來是空值的資料（如某些項目沒有填寫），也有沒有填寫的天數。每個國家開始登記的日期、漏登的日期、後來不再追蹤的日期都不一定，因此對齊資料的日期、決定資料可回答問題的區間都非常辛苦。

Load vaccination data#

import pandas as pd
# Our World in Data COVID-19 CSV, downloaded over HTTPS on every run.
OWID_COVID_CSV_URL = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
raw = pd.read_csv(OWID_COVID_CSV_URL)

/Users/jirlong/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
/Users/jirlong/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.4' currently installed).
  from pandas.core import (

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Input In [1], in <cell line: 2>()
      1 import pandas as pd
----> 2 raw = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    725     codecs.lookup_error(errors)
    727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
    729     path_or_buf,
    730     encoding=encoding,
    731     compression=compression,
    732     mode=mode,
    733     storage_options=storage_options,
    734 )
    736 handle = ioargs.filepath_or_buffer
    737 handles: list[BaseBuffer]

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/common.py:389, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    386         if content_encoding == "gzip":
    387             # Override compression based on Content-Encoding header
    388             compression = {"method": "gzip"}
--> 389         reader = BytesIO(req.read())
    390     return IOArgs(
    391         filepath_or_buffer=reader,
    392         encoding=encoding,
   (...)
    395         mode=fsspec_mode,
    396     )
    398 if is_fsspec_url(filepath_or_buffer):

File ~/opt/anaconda3/lib/python3.9/http/client.py:476, in HTTPResponse.read(self, amt)
    474 else:
    475     try:
--> 476         s = self._safe_read(self.length)
    477     except IncompleteRead:
    478         self._close_conn()

File ~/opt/anaconda3/lib/python3.9/http/client.py:626, in HTTPResponse._safe_read(self, amt)
    624 s = []
    625 while amt > 0:
--> 626     chunk = self.fp.read(min(amt, MAXAMOUNT))
    627     if not chunk:
    628         raise IncompleteRead(b''.join(s), amt)

File ~/opt/anaconda3/lib/python3.9/socket.py:704, in SocketIO.readinto(self, b)
    702 while True:
    703     try:
--> 704         return self._sock.recv_into(b)
    705     except timeout:
    706         self._timeout_occurred = True

File ~/opt/anaconda3/lib/python3.9/ssl.py:1241, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1237     if flags != 0:
   1238         raise ValueError(
   1239           "non-zero flags not allowed in calls to recv_into() on %s" %
   1240           self.__class__)
-> 1241     return self.read(nbytes, buffer)
   1242 else:
   1243     return super().recv_into(buffer, nbytes, flags)

File ~/opt/anaconda3/lib/python3.9/ssl.py:1099, in SSLSocket.read(self, len, buffer)
   1097 try:
   1098     if buffer is not None:
-> 1099         return self._sslobj.read(len, buffer)
   1100     else:
   1101         return self._sslobj.read(len)

KeyboardInterrupt: 

raw

	iso_code	continent	location	date	total_cases	new_cases	new_cases_smoothed	total_deaths	new_deaths	new_deaths_smoothed	...	male_smokers	handwashing_facilities	hospital_beds_per_thousand	life_expectancy	human_development_index	population	excess_mortality_cumulative_absolute	excess_mortality_cumulative	excess_mortality	excess_mortality_cumulative_per_million
0	AFG	Asia	Afghanistan	2020-02-24	5.0	5.0	NaN	NaN	NaN	NaN	...	NaN	37.746	0.5	64.83	0.511	41128772.0	NaN	NaN	NaN	NaN
1	AFG	Asia	Afghanistan	2020-02-25	5.0	0.0	NaN	NaN	NaN	NaN	...	NaN	37.746	0.5	64.83	0.511	41128772.0	NaN	NaN	NaN	NaN
2	AFG	Asia	Afghanistan	2020-02-26	5.0	0.0	NaN	NaN	NaN	NaN	...	NaN	37.746	0.5	64.83	0.511	41128772.0	NaN	NaN	NaN	NaN
3	AFG	Asia	Afghanistan	2020-02-27	5.0	0.0	NaN	NaN	NaN	NaN	...	NaN	37.746	0.5	64.83	0.511	41128772.0	NaN	NaN	NaN	NaN
4	AFG	Asia	Afghanistan	2020-02-28	5.0	0.0	NaN	NaN	NaN	NaN	...	NaN	37.746	0.5	64.83	0.511	41128772.0	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
233050	ZWE	Africa	Zimbabwe	2022-11-02	257893.0	0.0	0.0	5606.0	0.0	0.0	...	30.7	36.791	1.7	61.49	0.571	16320539.0	NaN	NaN	NaN	NaN
233051	ZWE	Africa	Zimbabwe	2022-11-03	257893.0	0.0	0.0	5606.0	0.0	0.0	...	30.7	36.791	1.7	61.49	0.571	16320539.0	NaN	NaN	NaN	NaN
233052	ZWE	Africa	Zimbabwe	2022-11-04	257893.0	0.0	0.0	5606.0	0.0	0.0	...	30.7	36.791	1.7	61.49	0.571	16320539.0	NaN	NaN	NaN	NaN
233053	ZWE	Africa	Zimbabwe	2022-11-05	257893.0	0.0	0.0	5606.0	0.0	0.0	...	30.7	36.791	1.7	61.49	0.571	16320539.0	NaN	NaN	NaN	NaN
233054	ZWE	Africa	Zimbabwe	2022-11-06	257893.0	0.0	0.0	5606.0	0.0	0.0	...	30.7	36.791	1.7	61.49	0.571	16320539.0	NaN	NaN	NaN	NaN

233055 rows × 67 columns

Observing data#

raw.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
       'new_vaccinations_smoothed_per_million',
       'new_people_vaccinated_smoothed',
       'new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
       'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
       'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate',
       'diabetes_prevalence', 'female_smokers', 'male_smokers',
       'handwashing_facilities', 'hospital_beds_per_thousand',
       'life_expectancy', 'human_development_index', 'population',
       'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
       'excess_mortality', 'excess_mortality_cumulative_per_million'],
      dtype='object')

計算每個洲（continent）有多少筆資料。每個洲的資料高達數萬筆，這是因為每一列代表一個國家一天的資料。

# Distinct continent labels; the set includes NaN for rows with no continent.
continents = raw["continent"]
print(set(continents))
# Number of rows per continent (value_counts leaves NaN out by default).
continents.value_counts()

{nan, 'Oceania', 'Africa', 'South America', 'Asia', 'North America', 'Europe'}

Europe           53357
Africa           52948
Asia             49281
North America    35177
Oceania          16422
South America    12716
Name: continent, dtype: int64

Filtering data#

Since the goal is to understand the similarities and differences between Taiwan and other countries, the following analysis uses only the Asian data, which includes South Korea, Japan, and other countries whose handling of the epidemic was similar to Taiwan's.

# Keep only the rows whose continent is Asia. Taking an explicit copy makes
# df_asia an independent DataFrame, so assigning to one of its columns later
# does not raise SettingWithCopyWarning.
df_asia = raw.loc[raw['continent'] == "Asia"].copy()
# Distinct locations present in the Asian subset.
set(df_asia.location)

{'Afghanistan',
 'Armenia',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Bhutan',
 'Brunei',
 'Cambodia',
 'China',
 'Georgia',
 'Hong Kong',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Israel',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Lebanon',
 'Macao',
 'Malaysia',
 'Maldives',
 'Mongolia',
 'Myanmar',
 'Nepal',
 'North Korea',
 'Northern Cyprus',
 'Oman',
 'Pakistan',
 'Palestine',
 'Philippines',
 'Qatar',
 'Saudi Arabia',
 'Singapore',
 'South Korea',
 'Sri Lanka',
 'Syria',
 'Taiwan',
 'Tajikistan',
 'Thailand',
 'Timor',
 'Turkey',
 'Turkmenistan',
 'United Arab Emirates',
 'Uzbekistan',
 'Vietnam',
 'Yemen'}

# Option 1: boolean mask with the .loc[] indexer to keep location == Taiwan
# df_tw = df_asia.loc[df_asia['location'] == "Taiwan"]

# Option 2: the same filter written with pandas.DataFrame.query()
df_tw = df_asia.query('location == "Taiwan"')
df_tw

	iso_code	continent	location	date	total_cases	new_cases	new_cases_smoothed	total_deaths	new_deaths	new_deaths_smoothed	...	male_smokers	handwashing_facilities	hospital_beds_per_thousand	life_expectancy	human_development_index	population	excess_mortality_cumulative_absolute	excess_mortality_cumulative	excess_mortality	excess_mortality_cumulative_per_million
203086	TWN	Asia	Taiwan	2020-01-16	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203087	TWN	Asia	Taiwan	2020-01-17	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203088	TWN	Asia	Taiwan	2020-01-18	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203089	TWN	Asia	Taiwan	2020-01-19	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203090	TWN	Asia	Taiwan	2020-01-20	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
204107	TWN	Asia	Taiwan	2022-11-02	7780125.0	33156.0	32034.429	12929.0	53.0	64.286	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204108	TWN	Asia	Taiwan	2022-11-03	7810077.0	29952.0	31219.429	13010.0	81.0	63.857	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204109	TWN	Asia	Taiwan	2022-11-04	7837658.0	27581.0	30222.143	13084.0	74.0	66.286	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204110	TWN	Asia	Taiwan	2022-11-05	7863193.0	25535.0	29230.429	13151.0	67.0	65.000	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204111	TWN	Asia	Taiwan	2022-11-06	7887538.0	24345.0	28204.000	13198.0	47.0	60.857	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN

1026 rows × 67 columns

df_tw.dtypes

iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                 float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

Line plot of time series#

由於要以時間（日期）當成X軸來繪圖，所以要先偵測看看目前的日期（date）變數型態為何（由於載下來的資料是CSV，八成是字串，偶而會是整數），所以會需要將日期的字串轉為Python的時間物件datetime。

# Attribute access on a DataFrame column returns a pandas Series.
print(type(df_tw.date))
# <class 'pandas.core.series.Series'>

# Before conversion the column holds the strings read from the CSV.
print(df_tw.date.dtype)
# object (str)

# Convert the date strings to datetime. df_tw was filtered out of df_asia, so
# setting a column on it in place raises SettingWithCopyWarning; assign()
# returns a new DataFrame with the converted column instead.
df_tw = df_tw.assign(date=pd.to_datetime(df_tw['date'], format="%Y-%m-%d"))

print(df_tw.date.dtype)
# datetime64[ns]

<class 'pandas.core.series.Series'>
datetime64[ns]
datetime64[ns]

/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/1951838620.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tw['date'] = pd.to_datetime(df_tw['date'], format="%Y-%m-%d")

df_tw

	iso_code	continent	location	date	total_cases	new_cases	new_cases_smoothed	total_deaths	new_deaths	new_deaths_smoothed	...	male_smokers	handwashing_facilities	hospital_beds_per_thousand	life_expectancy	human_development_index	population	excess_mortality_cumulative_absolute	excess_mortality_cumulative	excess_mortality	excess_mortality_cumulative_per_million
203086	TWN	Asia	Taiwan	2020-01-16	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203087	TWN	Asia	Taiwan	2020-01-17	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203088	TWN	Asia	Taiwan	2020-01-18	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203089	TWN	Asia	Taiwan	2020-01-19	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
203090	TWN	Asia	Taiwan	2020-01-20	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
204107	TWN	Asia	Taiwan	2022-11-02	7780125.0	33156.0	32034.429	12929.0	53.0	64.286	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204108	TWN	Asia	Taiwan	2022-11-03	7810077.0	29952.0	31219.429	13010.0	81.0	63.857	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204109	TWN	Asia	Taiwan	2022-11-04	7837658.0	27581.0	30222.143	13084.0	74.0	66.286	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204110	TWN	Asia	Taiwan	2022-11-05	7863193.0	25535.0	29230.429	13151.0	67.0	65.000	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN
204111	TWN	Asia	Taiwan	2022-11-06	7887538.0	24345.0	28204.000	13198.0	47.0	60.857	...	NaN	NaN	NaN	80.46	NaN	23893396.0	NaN	NaN	NaN	NaN

1026 rows × 67 columns

Plot 1 line by Pandas#

args

figsize=(10, 5): the figure size is given in inches as (width, height). See https://stackoverflow.com/questions/51174691/how-to-increase-image-size-of-pandas-dataframe-plot

# Line chart of Taiwan's new_cases over date; figsize is (width, height) in inches.
df_tw.plot(x="date", y="new_cases", figsize=(10, 5))

<AxesSubplot:xlabel='date'>

../_images/05ace346069af120e390cb0adc56b129b37952d4d42461bd1f4ece87025d0430.png

Plot multiple lines#

要繪製單一變項（一個國家）的折線圖很容易，X軸為日期、Y軸為案例數。但要如何繪製多個國家、多條折線圖（每個國家一條線）？以下就以日本和台灣兩國的數據為例來進行繪製。

location這個欄位紀錄了該列資料屬於日本或台灣。通常視覺化軟體會有兩種作法，一種做法是必須把日本和台灣在欄的方向展開（用df.pivot()），變成兩個變項，日本和台灣各一個變項，Python最基本的繪圖函式庫matplotlib就必須這麼做。但如果用號稱是matplotlib的進階版seaborn，則可以指定location這個變項作為群組資訊，簡單地說是用location當成群組變數來繪製不同的線。

# Select the Taiwan and Japan rows. .copy() detaches the selection from
# df_asia so the date conversion below does not raise SettingWithCopyWarning.
df1 = df_asia.loc[df_asia['location'].isin(["Taiwan", "Japan"])].copy()
# Parse the date strings into datetime values for use on the x axis.
df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
# Confirm which locations ended up in the selection.
set(df1.location)

/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/2904734101.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")

{'Japan', 'Taiwan'}

df1[['location', 'date', 'new_cases']]

	location	date	new_cases
104856	Japan	2020-01-22	NaN
104857	Japan	2020-01-23	0.0
104858	Japan	2020-01-24	0.0
104859	Japan	2020-01-25	0.0
104860	Japan	2020-01-26	2.0
...	...	...	...
204107	Taiwan	2022-11-02	33156.0
204108	Taiwan	2022-11-03	29952.0
204109	Taiwan	2022-11-04	27581.0
204110	Taiwan	2022-11-05	25535.0
204111	Taiwan	2022-11-06	24345.0

2046 rows × 3 columns

# df1 holds rows for more than one location (Japan and Taiwan), but this call
# passes a single y column with no grouping, so the locations are not drawn
# as separate lines.

df1.plot(x="date", y="new_cases", figsize=(10, 5))

<AxesSubplot:xlabel='date'>

../_images/2b3f91e686581fa46fcc0947e8928f7c4d40c25dfe9be41d5bdbbd64d8d9d608.png

# Reshape from long to wide: one row per date, and for each measure one
# column per location.
value_columns = ["new_cases", "total_cases", "total_vaccinations_per_hundred"]
df_wide = df1.pivot(index="date", columns="location", values=value_columns)
df_wide

	new_cases		total_cases		total_vaccinations_per_hundred
location	Japan	Taiwan	Japan	Taiwan	Japan	Taiwan
date
2020-01-16	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-17	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-18	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-19	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-20	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...
2022-11-02	70396.0	33156.0	22460268.0	7780125.0	268.25	264.69
2022-11-03	67473.0	29952.0	22527741.0	7810077.0	268.34	264.83
2022-11-04	34064.0	27581.0	22561805.0	7837658.0	268.56	265.00
2022-11-05	74170.0	25535.0	22635975.0	7863193.0	268.81	NaN
2022-11-06	66397.0	24345.0	22702372.0	7887538.0	268.90	265.15

1026 rows × 6 columns

`fillna()`#

# Replace every missing value with 0. This also zero-fills the cumulative
# columns (total_cases, total_vaccinations_per_hundred), so a day with no
# report shows 0 rather than the previous total.
df_wide = df_wide.fillna(0)
df_wide

	new_cases		total_cases		total_vaccinations_per_hundred
location	Japan	Taiwan	Japan	Taiwan	Japan	Taiwan
date
2020-01-16	0.0	0.0	0.0	0.0	0.00	0.00
2020-01-17	0.0	0.0	0.0	0.0	0.00	0.00
2020-01-18	0.0	0.0	0.0	0.0	0.00	0.00
2020-01-19	0.0	0.0	0.0	0.0	0.00	0.00
2020-01-20	0.0	0.0	0.0	0.0	0.00	0.00
...	...	...	...	...	...	...
2022-11-02	70396.0	33156.0	22460268.0	7780125.0	268.25	264.69
2022-11-03	67473.0	29952.0	22527741.0	7810077.0	268.34	264.83
2022-11-04	34064.0	27581.0	22561805.0	7837658.0	268.56	265.00
2022-11-05	74170.0	25535.0	22635975.0	7863193.0	268.81	0.00
2022-11-06	66397.0	24345.0	22702372.0	7887538.0	268.90	265.15

1026 rows × 6 columns

`reset_index()`#

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html

在經過pivot後，列方向會變成以date為index，此時我希望將date恢復為欄方向的變數，就需要用reset_index()。

# Move the date index back into an ordinary column; rows get a fresh 0..n-1 index.
df_wide = df_wide.reset_index()
df_wide

	date	new_cases		total_cases		total_vaccinations_per_hundred
location		Japan	Taiwan	Japan	Taiwan	Japan	Taiwan
0	2020-01-16	0.0	0.0	0.0	0.0	0.00	0.00
1	2020-01-17	0.0	0.0	0.0	0.0	0.00	0.00
2	2020-01-18	0.0	0.0	0.0	0.0	0.00	0.00
3	2020-01-19	0.0	0.0	0.0	0.0	0.00	0.00
4	2020-01-20	0.0	0.0	0.0	0.0	0.00	0.00
...	...	...	...	...	...	...	...
1021	2022-11-02	70396.0	33156.0	22460268.0	7780125.0	268.25	264.69
1022	2022-11-03	67473.0	29952.0	22527741.0	7810077.0	268.34	264.83
1023	2022-11-04	34064.0	27581.0	22561805.0	7837658.0	268.56	265.00
1024	2022-11-05	74170.0	25535.0	22635975.0	7863193.0	268.81	0.00
1025	2022-11-06	66397.0	24345.0	22702372.0	7887538.0	268.90	265.15

1026 rows × 7 columns

Visualized by matplotlib with pandas#

後面加上figsize參數可以調整長寬比。 pandas.DataFrame.plot的可用參數可見https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html。

# df_wide has two-level columns, so y="new_cases" presumably selects both the
# Japan and Taiwan series and draws one line each — TODO confirm against the
# rendered figure. figsize is (width, height) in inches.
df_wide.plot(x="date", y="new_cases", figsize=(10, 5))

<AxesSubplot:xlabel='date'>

../_images/7212a7135e3ac44944aa1fa04fcd203209904ebe04860f50384c206468777520.png

More params#

例如對Y軸取log。 pandas.DataFrame.plot的可用參數可見https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html。

# Same plot with logy=True, which puts the y axis on a logarithmic scale.
# NOTE(review): missing values were filled with 0 earlier, and zeros cannot be
# placed on a log axis — confirm the rendered figure is as intended.
df_wide.plot(x="date", y="new_cases", figsize=(10, 5), logy=True)

<AxesSubplot:xlabel='date'>

../_images/3467c8f49c18f979032a84fc114ccdca7c1378444bf204680aff11992eabc379.png

Visualized by seaborn#

seaborn可以將location作為群組變數，不同組的就繪製在不同的線。

以下先將location、date、new_cases取出後，把NA值填0。

# Select the three locations to compare. .copy() detaches the selection from
# df_asia so the date conversion below does not raise SettingWithCopyWarning.
df1 = df_asia.loc[df_asia['location'].isin(["Taiwan", "Japan", "South Korea"])].copy()
df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
# Long format for seaborn: one row per location per day, missing values as 0.
df_sns = df1[["location", 'date', 'new_cases']].fillna(0)
df_sns

/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/2059310855.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")

	location	date	new_cases
104856	Japan	2020-01-22	0.0
104857	Japan	2020-01-23	0.0
104858	Japan	2020-01-24	0.0
104859	Japan	2020-01-25	0.0
104860	Japan	2020-01-26	2.0
...	...	...	...
204107	Taiwan	2022-11-02	33156.0
204108	Taiwan	2022-11-03	29952.0
204109	Taiwan	2022-11-04	27581.0
204110	Taiwan	2022-11-05	25535.0
204111	Taiwan	2022-11-06	24345.0

3066 rows × 3 columns

Seaborn繪圖還是基於matplotlib套件，但他的lineplot()可以多給一個參數hue，並將location指定給該參數，這樣繪圖時便會依照不同的location進行繪圖。

import matplotlib.pyplot as plt
import seaborn as sns
# Create an 11x6 inch figure and pass its Axes to seaborn.
fig, ax = plt.subplots(figsize=(11, 6))
# hue='location' groups the rows by location so each location gets its own line.
sns.lineplot(data=df_sns, x='date', y='new_cases', hue='location', ax=ax)

<AxesSubplot:xlabel='date', ylabel='new_cases'>

../_images/f17de11095a299dddbe955c8eb4085c0676b66f32b22f0a6ea5c87d3ade81665.png

Visualized by bokeh: `plot_bokeh()`#

https://towardsdatascience.com/beautiful-and-easy-plotting-in-python-pandas-bokeh-afa92d792167
https://patrikhlobil.github.io/Pandas-Bokeh/ (Document of Pandas-Bokeh)

Bokeh的功能則是可以提供可互動的視覺化。但他不吃Pandas的MultiIndex，所以要將Pandas的階層欄位扁平化。以下是其中一種做法。做完扁平化就可以使用bokeh的函數來進行繪圖。

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Install pandas_bokeh first if it is missing (uncomment to run in a notebook):
# !pip install pandas_bokeh
import pandas_bokeh
# Send Bokeh output to the notebook; the cell prints "Loading BokehJS ...".
pandas_bokeh.output_notebook()

Collecting pandas_bokeh
  Downloading pandas_bokeh-0.5.5-py2.py3-none-any.whl (29 kB)
Requirement already satisfied: bokeh>=2.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas_bokeh) (2.4.2)
Requirement already satisfied: pandas>=0.22.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas_bokeh) (1.4.2)
Requirement already satisfied: packaging>=16.8 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (21.3)
Requirement already satisfied: tornado>=5.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (6.1)
Requirement already satisfied: pillow>=7.1.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (9.0.1)
Requirement already satisfied: Jinja2>=2.9 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (2.11.3)
Requirement already satisfied: PyYAML>=3.10 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (6.0)
Requirement already satisfied: typing-extensions>=3.10.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (4.1.1)
Requirement already satisfied: numpy>=1.11.3 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (1.21.5)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from Jinja2>=2.9->bokeh>=2.0->pandas_bokeh) (2.0.1)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from packaging>=16.8->bokeh>=2.0->pandas_bokeh) (3.0.4)
Requirement already satisfied: python-dateutil>=2.8.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.22.0->pandas_bokeh) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.22.0->pandas_bokeh) (2021.3)
Requirement already satisfied: six>=1.5 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=0.22.0->pandas_bokeh) (1.16.0)
Installing collected packages: pandas-bokeh
Successfully installed pandas-bokeh-0.5.5

Loading BokehJS ...

# Flatten the two-level column labels by joining each label tuple with "_".
# The date column's second level is empty, so its flat name becomes "date_".
df_wide2 = df_wide.copy()
df_wide2.columns = ['_'.join(levels) for levels in df_wide.columns]
df_wide2

	date_	new_cases_Japan	new_cases_Taiwan	total_cases_Japan	total_cases_Taiwan	total_vaccinations_per_hundred_Japan	total_vaccinations_per_hundred_Taiwan
0	2020-01-16	0.0	0.0	0.0	0.0	0.00	0.00
1	2020-01-17	0.0	0.0	0.0	0.0	0.00	0.00
2	2020-01-18	0.0	0.0	0.0	0.0	0.00	0.00
3	2020-01-19	0.0	0.0	0.0	0.0	0.00	0.00
4	2020-01-20	0.0	0.0	0.0	0.0	0.00	0.00
...	...	...	...	...	...	...	...
1021	2022-11-02	70396.0	33156.0	22460268.0	7780125.0	268.25	264.69
1022	2022-11-03	67473.0	29952.0	22527741.0	7810077.0	268.34	264.83
1023	2022-11-04	34064.0	27581.0	22561805.0	7837658.0	268.56	265.00
1024	2022-11-05	74170.0	25535.0	22635975.0	7863193.0	268.81	0.00
1025	2022-11-06	66397.0	24345.0	22702372.0	7887538.0	268.90	265.15

1026 rows × 7 columns

# Interactive line chart of the flattened new-case columns; "date_" is the
# flattened name of the date column.
new_case_columns = ['new_cases_Japan', 'new_cases_Taiwan']
df_wide2.plot_bokeh(kind='line', x='date_', y=new_case_columns)

Figure(

id = '1003', …)

Bar chart: vaccinating rate#

df_asia.dtypes

iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                 float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

# Parse the date strings into datetime64 values. assign() returns a new
# DataFrame, which avoids the SettingWithCopyWarning raised when a column is
# set in place on a frame that was filtered out of another one.
df_asia = df_asia.assign(date=pd.to_datetime(df_asia["date"]))
print(df_asia.date.dtype)

datetime64[ns]

/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/1918172663.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_asia['date'] = pd.to_datetime(df_asia.date)

# Latest date in the Asian subset, computed with the vectorized Series.max()
# instead of iterating the column with the builtin max().
df_asia["date"].max()

Timestamp('2022-11-07 00:00:00')

import datetime
# One-day snapshot of every Asian location. The date is fixed at 2021-10-28,
# not the latest date in the data.
snapshot_date = datetime.datetime(2021, 10, 28)
df_recent = df_asia.loc[df_asia['date'] == snapshot_date]

by pure pandas#

# df_recent.columns
# Horizontal bar chart: one bar per location, bar length taken from
# total_vaccinations_per_hundred.
df_recent.plot.barh(x="location", y="total_vaccinations_per_hundred")

<AxesSubplot:ylabel='location'>

../_images/1e6dad14c34b0aaa88d850ed2eee982efa98c89f43740ca7c52855f408c7aa9d.png

# Same chart, drawn on a larger 10x10 inch figure.
df_recent.plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))

<AxesSubplot:ylabel='location'>

../_images/73ea1e2012059fca5420e1085819c42db4754f0eb9a3bde4395d5d72dc095528.png

# Sort the snapshot by vaccination rate (ascending) before drawing the bars.
df_recent_sorted = df_recent.sort_values('total_vaccinations_per_hundred', ascending=True)
df_recent_sorted.plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))

<AxesSubplot:ylabel='location'>

../_images/82271f3a3d6995b9400ec052525b08467fb560217c7f632700d8fec62f3d4e2a.png

# Fill missing values with 0 first, so locations without a value sort to the
# low end instead of being placed last by sort_values, then plot.
df_recent_filled_sorted = (
    df_recent
    .fillna(0)
    .sort_values('total_vaccinations_per_hundred', ascending=True)
)
df_recent_filled_sorted.plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))

<AxesSubplot:ylabel='location'>

../_images/e128d7ab226a75a30fbdc04e405df125033c3c407277c1d382f9670b71f61618.png

by plot_bokeh#

# Fill NaN with 0 and sort ascending by vaccination rate, then draw the
# horizontal bars with pandas_bokeh.
filled_snapshot = df_recent.fillna(0)
toplot = filled_snapshot.sort_values('total_vaccinations_per_hundred', ascending=True)
toplot.plot_bokeh(kind="barh", x="location", y="total_vaccinations_per_hundred")

Figure(

id = '1282', …)

Bokeh Settings#

Displaying output in jupyter notebook#

from bokeh.io import output_notebook
# Display Bokeh figures inline in the Jupyter notebook.
output_notebook()