In this extra material, we will look into the use of python as a base programming language instead of R to recreate some of the plots we have create in this course. We will use polars as the dataframe manipulation library and seaborn for plotting with numpy and palmerpenguins as extra libraries to do these following exercises.

You could install these libraries using pip, conda or other means you prefer. For example, using pip you can run the following command in your terminal:

#| eval: false
pip install polars seaborn numpy palmerpenguins

Then you can open a Jupyter or Quarto notebook and start using these libraries to manipulate data and create plots.

1 Dataframes

import polars as pl
from palmerpenguins import load_penguins

penguins = pl.from_pandas( load_penguins() )
penguins.head()

shape: (5, 8)

species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	year
str	str	f64	f64	f64	f64	str	i64
"Adelie"	"Torgersen"	39.1	18.7	181.0	3750.0	"male"	2007
"Adelie"	"Torgersen"	39.5	17.4	186.0	3800.0	"female"	2007
"Adelie"	"Torgersen"	40.3	18.0	195.0	3250.0	"female"	2007
"Adelie"	"Torgersen"	null	null	null	null	null	2007
"Adelie"	"Torgersen"	36.7	19.3	193.0	3450.0	"female"	2007

First figure in python

1.1 From a file

gc_raw = pl.read_csv("../data/counts_raw.txt", has_header=True, separator="\t")
gc_raw.head()

shape: (5, 13)

Gene	Sample_1	Sample_2	Sample_3	Sample_4	Sample_5	Sample_6	Sample_7	Sample_8	Sample_9	Sample_10	Sample_11	Sample_12
str	i64	i64	i64	i64	i64	i64	i64	i64	i64	i64	i64	i64
"ENSG00000000003"	321	303	204	492	455	359	376	523	450	950	760	1436
"ENSG00000000005"	0	0	0	0	0	0	0	0	0	0	0	0
"ENSG00000000419"	696	660	472	951	963	689	706	1041	796	1036	789	1413
"ENSG00000000457"	59	54	44	109	73	66	60	125	74	108	115	174
"ENSG00000000460"	399	405	236	445	454	374	316	505	398	141	168	259

1.2 Wide to Long format

gc_raw_long = gc_raw.unpivot(index=["Gene"], variable_name="Sample_ID", value_name="counts")
gc_raw_long.head()

shape: (5, 3)

Gene	Sample_ID	counts
str	str	i64
"ENSG00000000003"	"Sample_1"	321
"ENSG00000000005"	"Sample_1"	0
"ENSG00000000419"	"Sample_1"	696
"ENSG00000000457"	"Sample_1"	59
"ENSG00000000460"	"Sample_1"	399

1.3 Joining metadata

md = pl.read_csv("../data/metadata.csv", has_header=True, separator=";")
md.head()

shape: (5, 5)

Sample_ID	Sample_Name	Time	Replicate	Cell
str	str	str	str	str
"Sample_1"	"t0_A"	"t0"	"A"	"A431"
"Sample_2"	"t0_B"	"t0"	"B"	"A431"
"Sample_3"	"t0_C"	"t0"	"C"	"A431"
"Sample_4"	"t2_A"	"t2"	"A"	"A431"
"Sample_5"	"t2_B"	"t2"	"B"	"A431"

gc_raw_all = gc_raw_long.join(md, on="Sample_ID", how="left")
gc_raw_all.head()

shape: (5, 7)

Gene	Sample_ID	counts	Sample_Name	Time	Replicate	Cell
str	str	i64	str	str	str	str
"ENSG00000000003"	"Sample_1"	321	"t0_A"	"t0"	"A"	"A431"
"ENSG00000000005"	"Sample_1"	0	"t0_A"	"t0"	"A"	"A431"
"ENSG00000000419"	"Sample_1"	696	"t0_A"	"t0"	"A"	"A431"
"ENSG00000000457"	"Sample_1"	59	"t0_A"	"t0"	"A"	"A431"
"ENSG00000000460"	"Sample_1"	399	"t0_A"	"t0"	"A"	"A431"

2 Seaborn

import seaborn as sns
sns.scatterplot(penguins, x="bill_length_mm", y="flipper_length_mm", hue="species")

sns.regplot(penguins, x="bill_length_mm", y="flipper_length_mm")

sns.lmplot(penguins, x="bill_length_mm", y="flipper_length_mm", hue="species")

sns.lmplot(penguins, x="bill_length_mm", y="flipper_length_mm", hue="species", palette='Set2')

custom_palette = ['#FF6347', '#4682B4', '#8A2BE2', '#FFD700']
sns.lmplot(penguins, x="bill_length_mm", y="flipper_length_mm", hue="species", palette=custom_palette)

import numpy as np
gc_raw_log = gc_raw_all.with_columns([
    ( pl.col("counts").log1p() / np.log(10) ).alias("log10_counts")
])
sns.boxplot(data=gc_raw_log, x="Sample_Name", y="log10_counts", hue="Time")

2.1 move legend

p1 = sns.boxplot(data=gc_raw_log, x="Sample_Name", y="log10_counts", hue="Time")
sns.move_legend(p1, "upper left", bbox_to_anchor=(1, 0.75))

2.2 Histogram

sns.histplot(data=penguins, x="flipper_length_mm")

sns.histplot(data=penguins, x="flipper_length_mm", kde=True)

sns.histplot(data=penguins, x="flipper_length_mm", hue="species", multiple="stack")

2.3 Faceting

g = sns.FacetGrid(gc_raw_log, col="Time", hue="Time")
g.map_dataframe(sns.boxplot, "Sample_Name", "log10_counts")

g = sns.FacetGrid(gc_raw_log, col="Time", hue="Time", sharex=False)
g.map_dataframe(sns.boxplot, "Sample_Name", "log10_counts")

2.3.1 2-dimensional faceting

g = sns.FacetGrid(penguins, col="island", row="species")
g.map_dataframe(sns.scatterplot, "bill_length_mm", "bill_depth_mm")

2.3.2 PairGrid

g = sns.PairGrid(penguins, hue="species")
g.map(sns.scatterplot)

let us remove the year column as it is not numeric.

penguins_no_year = penguins.drop("year")
g = sns.PairGrid(penguins_no_year, hue="species")
g.map(sns.scatterplot)

g = sns.PairGrid(penguins_no_year, hue="species")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

g = sns.PairGrid(penguins_no_year, hue="species")
g.map_diag(sns.histplot, kde=True)
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)

PairGrid is very flexible and allows you to use different types of plots on the diagonal, upper and lower triangle of the grid. You can also customize the appearance of the plots using the map method.

There is also a higher-level interface called pairplot that provides a simpler way to create pairwise plots. It is essentially a wrapper around PairGrid that uses scatterplots for the off-diagonal and histograms for the diagonal by default.

However, pairplot needs the data to be in pandas DataFrame format

sns.pairplot(penguins_no_year.to_pandas(), hue="species")

2.4 Text annotations

import seaborn.objects as so
(
    so.Plot(penguins, x="bill_length_mm", y="flipper_length_mm", color="species")
    .add(so.Dot())
)

(
    so.Plot(penguins, x="bill_length_mm", y="flipper_length_mm", color="species", text="sex")
    .add(so.Dot())
    .add(so.Text())
)

def std_err(col: pl.Expr) -> pl.Expr:
    return ((col.var() / col.count()).sqrt())

gc_raw_log_1 = gc_raw_log.group_by('Time').agg(pl.col('log10_counts').mean().alias('mean'), std_err(pl.col('log10_counts')).alias('std_err'))

gc_raw_log_1

shape: (4, 3)

Time	mean	std_err
str	f64	f64
"t0"	0.560369	0.002369
"t2"	0.605195	0.002484
"t6"	0.605814	0.002465
"t24"	0.674977	0.002619