Bokeh & Seaborn (Vaccination data)#

Main Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html

用來做範例的這個資料是COVID疫情期間的各國疫苗接種資料。資料包含不同國家在不同日期所上傳的資料。要注意的是,這份資料的空值相當的多,有看得出來是空值的資料(如某些項目沒有填寫),也有沒有填寫的天數。每個國家開始登記的日期、漏登的日期、後來不再追蹤的日期都不一定,因此對齊資料的日期、決定資料可回答問題的區間都非常辛苦。

Load vaccination data#

# Load the OWID COVID-19 dataset: one row per (country, date) pair.
# NOTE(review): this remote CSV is large (tens of MB); the download can
# stall on a slow connection (a KeyboardInterrupt during read is benign
# — just rerun the cell).
import pandas as pd
raw = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")
/Users/jirlong/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
/Users/jirlong/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.4' currently installed).
  from pandas.core import (
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Input In [1], in <cell line: 2>()
      1 import pandas as pd
----> 2 raw = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    725     codecs.lookup_error(errors)
    727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
    729     path_or_buf,
    730     encoding=encoding,
    731     compression=compression,
    732     mode=mode,
    733     storage_options=storage_options,
    734 )
    736 handle = ioargs.filepath_or_buffer
    737 handles: list[BaseBuffer]

File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/io/common.py:389, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    386         if content_encoding == "gzip":
    387             # Override compression based on Content-Encoding header
    388             compression = {"method": "gzip"}
--> 389         reader = BytesIO(req.read())
    390     return IOArgs(
    391         filepath_or_buffer=reader,
    392         encoding=encoding,
   (...)
    395         mode=fsspec_mode,
    396     )
    398 if is_fsspec_url(filepath_or_buffer):

File ~/opt/anaconda3/lib/python3.9/http/client.py:476, in HTTPResponse.read(self, amt)
    474 else:
    475     try:
--> 476         s = self._safe_read(self.length)
    477     except IncompleteRead:
    478         self._close_conn()

File ~/opt/anaconda3/lib/python3.9/http/client.py:626, in HTTPResponse._safe_read(self, amt)
    624 s = []
    625 while amt > 0:
--> 626     chunk = self.fp.read(min(amt, MAXAMOUNT))
    627     if not chunk:
    628         raise IncompleteRead(b''.join(s), amt)

File ~/opt/anaconda3/lib/python3.9/socket.py:704, in SocketIO.readinto(self, b)
    702 while True:
    703     try:
--> 704         return self._sock.recv_into(b)
    705     except timeout:
    706         self._timeout_occurred = True

File ~/opt/anaconda3/lib/python3.9/ssl.py:1241, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1237     if flags != 0:
   1238         raise ValueError(
   1239           "non-zero flags not allowed in calls to recv_into() on %s" %
   1240           self.__class__)
-> 1241     return self.read(nbytes, buffer)
   1242 else:
   1243     return super().recv_into(buffer, nbytes, flags)

File ~/opt/anaconda3/lib/python3.9/ssl.py:1099, in SSLSocket.read(self, len, buffer)
   1097 try:
   1098     if buffer is not None:
-> 1099         return self._sslobj.read(len, buffer)
   1100     else:
   1101         return self._sslobj.read(len)

KeyboardInterrupt: 
raw
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index population excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
0 AFG Asia Afghanistan 2020-02-24 5.0 5.0 NaN NaN NaN NaN ... NaN 37.746 0.5 64.83 0.511 41128772.0 NaN NaN NaN NaN
1 AFG Asia Afghanistan 2020-02-25 5.0 0.0 NaN NaN NaN NaN ... NaN 37.746 0.5 64.83 0.511 41128772.0 NaN NaN NaN NaN
2 AFG Asia Afghanistan 2020-02-26 5.0 0.0 NaN NaN NaN NaN ... NaN 37.746 0.5 64.83 0.511 41128772.0 NaN NaN NaN NaN
3 AFG Asia Afghanistan 2020-02-27 5.0 0.0 NaN NaN NaN NaN ... NaN 37.746 0.5 64.83 0.511 41128772.0 NaN NaN NaN NaN
4 AFG Asia Afghanistan 2020-02-28 5.0 0.0 NaN NaN NaN NaN ... NaN 37.746 0.5 64.83 0.511 41128772.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
233050 ZWE Africa Zimbabwe 2022-11-02 257893.0 0.0 0.0 5606.0 0.0 0.0 ... 30.7 36.791 1.7 61.49 0.571 16320539.0 NaN NaN NaN NaN
233051 ZWE Africa Zimbabwe 2022-11-03 257893.0 0.0 0.0 5606.0 0.0 0.0 ... 30.7 36.791 1.7 61.49 0.571 16320539.0 NaN NaN NaN NaN
233052 ZWE Africa Zimbabwe 2022-11-04 257893.0 0.0 0.0 5606.0 0.0 0.0 ... 30.7 36.791 1.7 61.49 0.571 16320539.0 NaN NaN NaN NaN
233053 ZWE Africa Zimbabwe 2022-11-05 257893.0 0.0 0.0 5606.0 0.0 0.0 ... 30.7 36.791 1.7 61.49 0.571 16320539.0 NaN NaN NaN NaN
233054 ZWE Africa Zimbabwe 2022-11-06 257893.0 0.0 0.0 5606.0 0.0 0.0 ... 30.7 36.791 1.7 61.49 0.571 16320539.0 NaN NaN NaN NaN

233055 rows × 67 columns

Observing data#

raw.columns
Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
       'new_vaccinations_smoothed_per_million',
       'new_people_vaccinated_smoothed',
       'new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
       'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
       'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate',
       'diabetes_prevalence', 'female_smokers', 'male_smokers',
       'handwashing_facilities', 'hospital_beds_per_thousand',
       'life_expectancy', 'human_development_index', 'population',
       'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
       'excess_mortality', 'excess_mortality_cumulative_per_million'],
      dtype='object')

計算每個洲(continent)有多少資料。每個洲會高達數萬筆資料,原因是因為每一列是一個國家一天的資料。

# List the distinct continents, then count how many rows each one has
# (row counts are large because each row is one country-day).
print(set(raw['continent']))
raw['continent'].value_counts()
{nan, 'Oceania', 'Africa', 'South America', 'Asia', 'North America', 'Europe'}
Europe           53357
Africa           52948
Asia             49281
North America    35177
Oceania          16422
South America    12716
Name: continent, dtype: int64

Filtering data#

Since the purpose is to understand the similarities and differences between Taiwan and other countries, the following deals only with the Asian data, including South Korea, Japan, and other countries whose epidemic situations are similar to Taiwan's.

# Keep only the Asian rows.  .copy() detaches the result from `raw`,
# so later column assignments on df_asia (e.g. the date conversion
# further down) do not raise SettingWithCopyWarning.
df_asia = raw.loc[raw['continent'] == "Asia"].copy()
set(df_asia.location)
{'Afghanistan',
 'Armenia',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Bhutan',
 'Brunei',
 'Cambodia',
 'China',
 'Georgia',
 'Hong Kong',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Israel',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Lebanon',
 'Macao',
 'Malaysia',
 'Maldives',
 'Mongolia',
 'Myanmar',
 'Nepal',
 'North Korea',
 'Northern Cyprus',
 'Oman',
 'Pakistan',
 'Palestine',
 'Philippines',
 'Qatar',
 'Saudi Arabia',
 'Singapore',
 'South Korea',
 'Sri Lanka',
 'Syria',
 'Taiwan',
 'Tajikistan',
 'Thailand',
 'Timor',
 'Turkey',
 'Turkmenistan',
 'United Arab Emirates',
 'Uzbekistan',
 'Vietnam',
 'Yemen'}
# Using .loc to filter location == Taiwan
# df_tw = df_asia.loc[df_asia['location'] == "Taiwan"]

# Using pandas.DataFrame.query(); .copy() detaches the slice so the
# later in-place date conversion on df_tw does not raise
# SettingWithCopyWarning.
df_tw = df_asia.query('location == "Taiwan"').copy()
df_tw
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index population excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
203086 TWN Asia Taiwan 2020-01-16 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203087 TWN Asia Taiwan 2020-01-17 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203088 TWN Asia Taiwan 2020-01-18 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203089 TWN Asia Taiwan 2020-01-19 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203090 TWN Asia Taiwan 2020-01-20 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
204107 TWN Asia Taiwan 2022-11-02 7780125.0 33156.0 32034.429 12929.0 53.0 64.286 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204108 TWN Asia Taiwan 2022-11-03 7810077.0 29952.0 31219.429 13010.0 81.0 63.857 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204109 TWN Asia Taiwan 2022-11-04 7837658.0 27581.0 30222.143 13084.0 74.0 66.286 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204110 TWN Asia Taiwan 2022-11-05 7863193.0 25535.0 29230.429 13151.0 67.0 65.000 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204111 TWN Asia Taiwan 2022-11-06 7887538.0 24345.0 28204.000 13198.0 47.0 60.857 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN

1026 rows × 67 columns

df_tw.dtypes
iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                 float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

Line plot of time series#

由於要以時間(日期)當成X軸來繪圖,所以要先偵測看看目前的日期(date)變數型態為何(由於載下來的資料是CSV,八成是字串,偶而會是整數),所以會需要將日期的字串轉為Python的時間物件datetime

print(type(df_tw.date))
# <class 'pandas.core.series.Series'>

print(df_tw.date.dtype)
# object (str)

# Convert the date strings to datetime64.  assign() returns a new
# DataFrame, so this is safe even when df_tw is a slice of another
# frame (plain `df_tw['date'] = ...` raised SettingWithCopyWarning).
df_tw = df_tw.assign(date=pd.to_datetime(df_tw['date'], format="%Y-%m-%d"))

print(df_tw.date.dtype)
# datetime64[ns]
<class 'pandas.core.series.Series'>
datetime64[ns]
datetime64[ns]
/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/1951838620.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tw['date'] = pd.to_datetime(df_tw['date'], format="%Y-%m-%d")
df_tw
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index population excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
203086 TWN Asia Taiwan 2020-01-16 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203087 TWN Asia Taiwan 2020-01-17 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203088 TWN Asia Taiwan 2020-01-18 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203089 TWN Asia Taiwan 2020-01-19 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
203090 TWN Asia Taiwan 2020-01-20 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
204107 TWN Asia Taiwan 2022-11-02 7780125.0 33156.0 32034.429 12929.0 53.0 64.286 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204108 TWN Asia Taiwan 2022-11-03 7810077.0 29952.0 31219.429 13010.0 81.0 63.857 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204109 TWN Asia Taiwan 2022-11-04 7837658.0 27581.0 30222.143 13084.0 74.0 66.286 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204110 TWN Asia Taiwan 2022-11-05 7863193.0 25535.0 29230.429 13151.0 67.0 65.000 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN
204111 TWN Asia Taiwan 2022-11-06 7887538.0 24345.0 28204.000 13198.0 47.0 60.857 ... NaN NaN NaN 80.46 NaN 23893396.0 NaN NaN NaN NaN

1026 rows × 67 columns

Plot 1 line by Pandas#

args

# Daily new cases for Taiwan as a single line (x = date).
df_tw.plot(x="date", y="new_cases", figsize=(10, 5))
<AxesSubplot:xlabel='date'>
../_images/05ace346069af120e390cb0adc56b129b37952d4d42461bd1f4ece87025d0430.png

Plot multiple lines#

要繪製單一變項(一個國家)的折線圖很容易,X軸為日期、Y軸為案例數。但要如何繪製多個國家、多條折線圖(每個國家一條線)?以下就以日本和台灣兩國的數據為例來進行繪製。

location這個欄位紀錄了該列資料屬於日本或台灣。通常視覺化軟體會有兩種作法,一種做法是必須把日本和台灣在欄的方向展開(用df.pivot()),變成兩個變項,日本和台灣各一個變項,Python最基本的繪圖函式庫matplotlib就必須這麼做。但如果用號稱是matplotlib的進階版seaborn,則可以指定location這個變項作為群組資訊,簡單地說是用location當成群組變數來繪製不同的線。

# Keep Taiwan and Japan only; .copy() so the date assignment on the
# next line does not raise SettingWithCopyWarning on a slice view.
df1 = df_asia.loc[df_asia['location'].isin(["Taiwan", "Japan"])].copy()
df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
set(df1.location)
/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/2904734101.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
{'Japan', 'Taiwan'}
df1[['location', 'date', 'new_cases']]
location date new_cases
104856 Japan 2020-01-22 NaN
104857 Japan 2020-01-23 0.0
104858 Japan 2020-01-24 0.0
104859 Japan 2020-01-25 0.0
104860 Japan 2020-01-26 2.0
... ... ... ...
204107 Taiwan 2022-11-02 33156.0
204108 Taiwan 2022-11-03 29952.0
204109 Taiwan 2022-11-04 27581.0
204110 Taiwan 2022-11-05 25535.0
204111 Taiwan 2022-11-06 24345.0

2046 rows × 3 columns

# df1 data contains more than 1 location

# NOTE(review): plotting the long-format frame draws ONE line across
# all rows, so Japan's and Taiwan's series are strung together — this
# is why the data is pivoted to wide format next.
df1.plot(x="date", y="new_cases", figsize=(10, 5))
<AxesSubplot:xlabel='date'>
../_images/2b3f91e686581fa46fcc0947e8928f7c4d40c25dfe9be41d5bdbbd64d8d9d608.png
# Pivot long -> wide: index = date, one sub-column per location for
# each selected metric.  The result has a two-level (MultiIndex)
# column axis: (metric, location).
df_wide = df1.pivot(index="date", columns="location", 
                    values=["new_cases", "total_cases", "total_vaccinations_per_hundred"])
df_wide
new_cases total_cases total_vaccinations_per_hundred
location Japan Taiwan Japan Taiwan Japan Taiwan
date
2020-01-16 NaN NaN NaN NaN NaN NaN
2020-01-17 NaN NaN NaN NaN NaN NaN
2020-01-18 NaN NaN NaN NaN NaN NaN
2020-01-19 NaN NaN NaN NaN NaN NaN
2020-01-20 NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ...
2022-11-02 70396.0 33156.0 22460268.0 7780125.0 268.25 264.69
2022-11-03 67473.0 29952.0 22527741.0 7810077.0 268.34 264.83
2022-11-04 34064.0 27581.0 22561805.0 7837658.0 268.56 265.00
2022-11-05 74170.0 25535.0 22635975.0 7863193.0 268.81 NaN
2022-11-06 66397.0 24345.0 22702372.0 7887538.0 268.90 265.15

1026 rows × 6 columns

fillna()#

# Replace every NA with 0 so the series plot without gaps.
df_wide = df_wide.fillna(0)
df_wide.new_cases.Taiwan
df_wide
new_cases total_cases total_vaccinations_per_hundred
location Japan Taiwan Japan Taiwan Japan Taiwan
date
2020-01-16 0.0 0.0 0.0 0.0 0.00 0.00
2020-01-17 0.0 0.0 0.0 0.0 0.00 0.00
2020-01-18 0.0 0.0 0.0 0.0 0.00 0.00
2020-01-19 0.0 0.0 0.0 0.0 0.00 0.00
2020-01-20 0.0 0.0 0.0 0.0 0.00 0.00
... ... ... ... ... ... ...
2022-11-02 70396.0 33156.0 22460268.0 7780125.0 268.25 264.69
2022-11-03 67473.0 29952.0 22527741.0 7810077.0 268.34 264.83
2022-11-04 34064.0 27581.0 22561805.0 7837658.0 268.56 265.00
2022-11-05 74170.0 25535.0 22635975.0 7863193.0 268.81 0.00
2022-11-06 66397.0 24345.0 22702372.0 7887538.0 268.90 265.15

1026 rows × 6 columns

reset_index()#

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html

在經過pivot後,列方向會變成以date為index,此時我希望將data恢復為欄方向的變數,就需要用reset_index()

# Move the date index back into a regular column.  Under the column
# MultiIndex, 'date' gets an empty second level — it flattens to
# 'date_' later.
df_wide.reset_index(inplace=True)
df_wide
date new_cases total_cases total_vaccinations_per_hundred
location Japan Taiwan Japan Taiwan Japan Taiwan
0 2020-01-16 0.0 0.0 0.0 0.0 0.00 0.00
1 2020-01-17 0.0 0.0 0.0 0.0 0.00 0.00
2 2020-01-18 0.0 0.0 0.0 0.0 0.00 0.00
3 2020-01-19 0.0 0.0 0.0 0.0 0.00 0.00
4 2020-01-20 0.0 0.0 0.0 0.0 0.00 0.00
... ... ... ... ... ... ... ...
1021 2022-11-02 70396.0 33156.0 22460268.0 7780125.0 268.25 264.69
1022 2022-11-03 67473.0 29952.0 22527741.0 7810077.0 268.34 264.83
1023 2022-11-04 34064.0 27581.0 22561805.0 7837658.0 268.56 265.00
1024 2022-11-05 74170.0 25535.0 22635975.0 7863193.0 268.81 0.00
1025 2022-11-06 66397.0 24345.0 22702372.0 7887538.0 268.90 265.15

1026 rows × 7 columns

Visualized by matplotlib with pandas#

後面加上figsize參數可以調整長寬比。 pandas.DataFrame.plot的可用參數可見https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html。

# y="new_cases" selects both sub-columns (Japan, Taiwan) of the
# MultiIndex, so pandas draws one line per country.
df_wide.plot(x="date", y="new_cases", figsize=(10, 5))
<AxesSubplot:xlabel='date'>
../_images/7212a7135e3ac44944aa1fa04fcd203209904ebe04860f50384c206468777520.png

More params#

例如對Y軸取log。 pandas.DataFrame.plot的可用參數可見https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html。

# Same chart with a logarithmic Y axis (logy=True).
df_wide.plot(x="date", y="new_cases", figsize=(10, 5), logy=True)
<AxesSubplot:xlabel='date'>
../_images/3467c8f49c18f979032a84fc114ccdca7c1378444bf204680aff11992eabc379.png

Visualized by seaborn#

seaborn可以將location作為群組變數,不同組的就繪製在不同的線。

以下先將 location、date、new_cases 三個欄位取出後,把NA值填0。

# Three countries this time; .copy() so the date conversion below does
# not raise SettingWithCopyWarning on a slice view.
df1 = df_asia.loc[df_asia['location'].isin(["Taiwan", "Japan", "South Korea"])].copy()
df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
# Keep only the plotting columns and fill missing counts with 0.
df_sns = df1[["location", 'date', 'new_cases']].fillna(0)
df_sns
/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/2059310855.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['date'] = pd.to_datetime(df1['date'], format="%Y-%m-%d")
location date new_cases
104856 Japan 2020-01-22 0.0
104857 Japan 2020-01-23 0.0
104858 Japan 2020-01-24 0.0
104859 Japan 2020-01-25 0.0
104860 Japan 2020-01-26 2.0
... ... ... ...
204107 Taiwan 2022-11-02 33156.0
204108 Taiwan 2022-11-03 29952.0
204109 Taiwan 2022-11-04 27581.0
204110 Taiwan 2022-11-05 25535.0
204111 Taiwan 2022-11-06 24345.0

3066 rows × 3 columns

Seaborn繪圖還是基於matplotlib套件,但他的lineplot()可以多給一個參數hue,並將location指定給該參數,這樣繪圖時便會依照不同的location進行繪圖。

import matplotlib.pyplot as plt
import seaborn as sns
# hue='location' makes seaborn draw one line per country on the
# shared axes created here.
fig, ax = plt.subplots(figsize=(11, 6))
sns.lineplot(data=df_sns, x='date', y='new_cases', hue='location', ax=ax)
<AxesSubplot:xlabel='date', ylabel='new_cases'>
../_images/f17de11095a299dddbe955c8eb4085c0676b66f32b22f0a6ea5c87d3ade81665.png

Visualized by bokeh: plot_bokeh()#

Bokeh的功能則是可以提供可互動的視覺化。但他不吃Pandas的MultiIndex,所以要將Pandas的階層欄位扁平化。以下是其中一種做法。做完扁平化就可以使用bokeh的函數來進行繪圖。

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
# !pip install pandas_bokeh
import pandas_bokeh
# Route pandas-bokeh output into the notebook (loads BokehJS inline).
pandas_bokeh.output_notebook()
Collecting pandas_bokeh
  Downloading pandas_bokeh-0.5.5-py2.py3-none-any.whl (29 kB)
Requirement already satisfied: bokeh>=2.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas_bokeh) (2.4.2)
Requirement already satisfied: pandas>=0.22.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas_bokeh) (1.4.2)
Requirement already satisfied: packaging>=16.8 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (21.3)
Requirement already satisfied: tornado>=5.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (6.1)
Requirement already satisfied: pillow>=7.1.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (9.0.1)
Requirement already satisfied: Jinja2>=2.9 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (2.11.3)
Requirement already satisfied: PyYAML>=3.10 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (6.0)
Requirement already satisfied: typing-extensions>=3.10.0 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (4.1.1)
Requirement already satisfied: numpy>=1.11.3 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from bokeh>=2.0->pandas_bokeh) (1.21.5)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from Jinja2>=2.9->bokeh>=2.0->pandas_bokeh) (2.0.1)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from packaging>=16.8->bokeh>=2.0->pandas_bokeh) (3.0.4)
Requirement already satisfied: python-dateutil>=2.8.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.22.0->pandas_bokeh) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.22.0->pandas_bokeh) (2021.3)
Requirement already satisfied: six>=1.5 in /Users/jirlong/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=0.22.0->pandas_bokeh) (1.16.0)
Installing collected packages: pandas-bokeh
Successfully installed pandas-bokeh-0.5.5
Loading BokehJS ...
# Bokeh does not accept a pandas MultiIndex, so flatten the two column
# levels into single strings joined by '_'.  'date' has an empty
# second level, so it becomes 'date_' (note the trailing underscore,
# which the plot calls below must use).
df_wide2 = df_wide.copy()
df_wide2.columns = df_wide.columns.map('_'.join)
df_wide2
date_ new_cases_Japan new_cases_Taiwan total_cases_Japan total_cases_Taiwan total_vaccinations_per_hundred_Japan total_vaccinations_per_hundred_Taiwan
0 2020-01-16 0.0 0.0 0.0 0.0 0.00 0.00
1 2020-01-17 0.0 0.0 0.0 0.0 0.00 0.00
2 2020-01-18 0.0 0.0 0.0 0.0 0.00 0.00
3 2020-01-19 0.0 0.0 0.0 0.0 0.00 0.00
4 2020-01-20 0.0 0.0 0.0 0.0 0.00 0.00
... ... ... ... ... ... ... ...
1021 2022-11-02 70396.0 33156.0 22460268.0 7780125.0 268.25 264.69
1022 2022-11-03 67473.0 29952.0 22527741.0 7810077.0 268.34 264.83
1023 2022-11-04 34064.0 27581.0 22561805.0 7837658.0 268.56 265.00
1024 2022-11-05 74170.0 25535.0 22635975.0 7863193.0 268.81 0.00
1025 2022-11-06 66397.0 24345.0 22702372.0 7887538.0 268.90 265.15

1026 rows × 7 columns

# Interactive line chart; x refers to the flattened column name
# 'date_' produced by the '_'.join above.
df_wide2.plot_bokeh(
    kind='line',
    x='date_',
    y=['new_cases_Japan', 'new_cases_Taiwan']
)
Figure(
id = '1003', …)

Bar chart: vaccinating rate#

df_asia.dtypes
iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                 float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object
# Convert the whole-continent frame's dates.  assign() returns a new
# DataFrame, avoiding the SettingWithCopyWarning that in-place column
# assignment on a sliced frame produced.
df_asia = df_asia.assign(date=pd.to_datetime(df_asia.date))
print(df_asia.date.dtype)
datetime64[ns]
/var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/ipykernel_38668/1918172663.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_asia['date'] = pd.to_datetime(df_asia.date)
max(df_asia.date)
Timestamp('2022-11-07 00:00:00')
import datetime
# Snapshot of every Asian country on a single day.
# NOTE(review): the data runs to 2022-11-07 (see max above) but
# 2021-10-28 is hard-coded here — confirm this is the intended date.
df_recent = df_asia.loc[df_asia['date'] == datetime.datetime(2021, 10, 28)]

by pure pandas#

# df_recent.columns
# Horizontal bars: doses administered per 100 people, one bar per country.
df_recent.plot.barh(x="location", y="total_vaccinations_per_hundred")
<AxesSubplot:ylabel='location'>
../_images/1e6dad14c34b0aaa88d850ed2eee982efa98c89f43740ca7c52855f408c7aa9d.png
df_recent.plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))
<AxesSubplot:ylabel='location'>
../_images/73ea1e2012059fca5420e1085819c42db4754f0eb9a3bde4395d5d72dc095528.png
df_recent.sort_values('total_vaccinations_per_hundred', ascending=True).plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))
<AxesSubplot:ylabel='location'>
../_images/82271f3a3d6995b9400ec052525b08467fb560217c7f632700d8fec62f3d4e2a.png
# fillna(0) keeps countries with missing data visible; sort ascending so
# the highest-vaccination country appears at the top of the barh chart.
df_recent.fillna(0).sort_values('total_vaccinations_per_hundred', ascending=True).plot.barh(x="location", y="total_vaccinations_per_hundred", figsize=(10, 10))
<AxesSubplot:ylabel='location'>
../_images/e128d7ab226a75a30fbdc04e405df125033c3c407277c1d382f9670b71f61618.png

by plot_bokeh#

# Same ranked bar chart, rendered interactively with pandas-bokeh.
toplot = df_recent.fillna(0).sort_values('total_vaccinations_per_hundred', ascending=True)
toplot.plot_bokeh(kind="barh", x="location", y="total_vaccinations_per_hundred")
Figure(
id = '1282', …)

Bokeh Settings#

Displaying output in jupyter notebook#

# Direct plain-bokeh (figure/show) output into the notebook as well.
from bokeh.io import output_notebook
output_notebook()
Loading BokehJS ...

Adjust figure size along with windows size#

# Toy dataset for the bokeh demos below: point coordinates, a
# frequency per point, and the frequency's string label.
coords = [1, 2, 3, 4, 5]
freqs = [10, 20, 13, 40, 35]
plot_df = pd.DataFrame({"x": coords,
                        "y": coords,
                        "freq": freqs,
                        "label": [str(f) for f in freqs]})
plot_df
x y freq label
0 1 1 10 10
1 2 2 20 20
2 3 3 13 13
3 4 4 40 40
4 5 5 35 35
# Scatter plot with marker size driven by freq; 'scale_width' makes
# the figure resize along with the browser window width.
p = figure(title = "TEST")
p.circle(plot_df["x"], plot_df["y"], fill_alpha=0.2, size=plot_df["freq"])
p.sizing_mode = 'scale_width'
show(p)

Color mapper#

Categorical color transforming Manually#

# from bokeh.palettes import Magma, Inferno, Plasma, Viridis, Cividis, d3

# cluster_label = list(Counter(df2plot.cluster).keys())
# color_mapper = CategoricalColorMapper(palette=d3['Category20'][len(cluster_label)], factors=cluster_label)
# p = figure(title = "doc clustering")
# p.sizing_mode = 'scale_width'
# p.circle(x = "x", y = "y", 
#          color={'field': 'cluster', 'transform': color_mapper},
#          source = df2plot, 
#          fill_alpha=0.5, size=5, line_color=None)
# show(p)

Continuous color transforming#

from bokeh.palettes import Magma, Inferno, Plasma, Viridis, Cividis, d3
from bokeh.models import LogColorMapper, LinearColorMapper, LabelSet, ColumnDataSource 


# Map each point's freq value linearly onto the Plasma256 palette, so
# both marker size and fill colour encode the same variable.
p = figure(title = "ColorMapper Tester")
color_mapper = LinearColorMapper(palette="Plasma256", 
                                 low = min(plot_df["freq"]), 
                                 high = max(plot_df["freq"]))

# ColumnDataSource lets the glyph reference columns by name ("x", "y",
# "freq") instead of passing arrays directly.
source = ColumnDataSource(plot_df)
p.circle("x", "y", fill_alpha = 0.5, 
         size = "freq", 
         line_color=None,
         source = source,
         fill_color = {'field': 'freq', 'transform': color_mapper}
        )

p.sizing_mode = 'scale_width'

show(p)