1 Recipes in slides

1.1 Foundations

1.1.1 Hardy-Weinberg Equilibrium

Code to generate Hardy-Weinberg equilibrium plot.

Bash script to download data from 1000 genomes
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz.tbi
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

# Make population-specific sample files
panel=integrated_call_samples_v3.20130502.ALL.panel
awk '{if ($2 == "CEU") print $1}' $panel > CEU.samples.txt
awk '{if ($2 == "CHB") print $1}' $panel > CHB.samples.txt
awk '{if ($2 == "YRI") print $1}' $panel > YRI.samples.txt

# Subset vcf to these files
VCF=ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz
REGION=22:17000000-20000000
for pop in CEU CHB YRI; do
    echo $pop
    bcftools view -v snps -m 2 -M 2 -S $pop.samples.txt $VCF $REGION | bgzip -c > $pop.$REGION.vcf.gz
    tabix -f $pop.$REGION.vcf.gz
    plink --vcf $pop.$REGION.vcf.gz --hardy --out $pop.$REGION
    gzip -v $pop.${REGION}.hwe
done
R code to plot HWE proportions
# Read genotypes from files generated by hwe.sh
pops <- c("CEU", "CHB", "YRI")
fn <- file.path("data/Homo_sapiens/1000g/genotypes", paste0(pops, ".22:17000000-20000000.hwe.gz"))
names(fn) <- pops
# Make data frame from all populations
x <- do.call("rbind", lapply(pops, function(p) {
    y <- cbind(population = p, read.table(fn[[p]], header = TRUE))
    # GENO column holds genotypes as minor/het/major counts.
    geno <- do.call("rbind", lapply(strsplit(y$GENO, "/"), as.numeric))
    # Find n that splits geno in two equally sized lists
    n <- round(nrow(geno)/2)
    # Because minor is always less than 50, we need to reverse the order of
    # half of the genotypes to produce nicer plots
    geno <- rbind(geno[1:n, ], geno[(n + 1):nrow(geno), 3:1])
    colnames(geno) <- c("AA.cnt", "Aa.cnt", "aa.cnt")
    # Normalize counts to frequencies
    geno_frq <- as.data.frame(geno/apply(geno, 1, sum))
    colnames(geno_frq) <- c("AA", "Aa", "aa")
    # Calculate allele frequencies under HWE assumption
    allele_frq <- as.data.frame(cbind(geno_frq$AA + 0.5 * geno_frq$Aa, geno_frq$aa +
        0.5 * geno_frq$Aa))
    colnames(allele_frq) <- c("p", "q")
    # Make one large table with genotypes, genotype frequencies, and allele
    # frequencies
    cbind(y, geno, geno_frq, allele_frq)
}))
# Pivot genotypes to value column
x <- pivot_longer(x, cols = c("AA", "Aa", "aa"), names_to = "genotype")
p <- seq(0, 1, 0.01)
n <- 10000
# Sample 10000 snps from each population
z <- do.call("rbind", by(x, x$population, function(y) {
    y[sample(nrow(y), 10000), ]
}))
z$population <- factor(z$population)
levels(z$population) <- c("European", "Chinese", "Yoruban")

# Plotting colors
vend <- 0.8
cols_hwe <- c(viridis_pal(end = vend)(3), "black", "black")
lt_hwe <- c(rep("blank", 3), "dashed", "solid")
shape_hwe <- c(rep(16, 3), NA, NA)
cnames <- c("aa", "Aa", "AA", "Hardy Weinberg Equilibrium", "Mean")
names(cols_hwe) <- cnames
names(lt_hwe) <- cnames
names(shape_hwe) <- cnames

# Make plot
ggplot(z, aes(x = p, y = value)) + geom_point(size = 2, alpha = 0.6, aes(color = genotype)) +
    geom_smooth(method = "loess", linewidth = 1, show.legend = TRUE, lty = "dashed",
        aes(group = genotype, color = "Mean")) + geom_line(inherit.aes = FALSE, aes(x = p,
    y = p^2, color = "Hardy Weinberg Equilibrium")) + geom_line(inherit.aes = FALSE,
    aes(x = p, y = 2 * p * (1 - p), color = "Hardy Weinberg Equilibrium")) + geom_line(inherit.aes = FALSE,
    aes(x = p, y = (1 - p)^2, color = "Hardy Weinberg Equilibrium")) + xlab("allele frequency") +
    ylab("genotype frequency") + facet_grid(. ~ population) + scale_color_viridis_d(end = vend) +
    scale_color_manual(name = NULL, values = cols_hwe, guide = guide_legend(override.aes = list(linetype = lt_hwe,
        shape = shape_hwe)))

1.1.2 Genetic drift as allele frequency distributions

R code to generate and plot allele frequency distribution
# Recipe to generate genetic drift histogram. The procedure consists of
# tracking a fixed number of allelic states for a system with two alleles (say,
# a and A), where the variable x corresponds to the *probability* of being in a
# particular state. Default setting reproduce the data in Figure 3.4 in The
# neutral theory of molecular evolution (Kimura, 1983).

library(expm)  # Matrix exponential library

# Number of possible allelic states. Setting this too large results in a
# prohibitively large transition matrix
n <- 10

# The x vector traces the *probability* of samples/sequences in a given allelic
# state, where x[1]=probability of fixation for allele a, x[n+1]=probability of
# fixation for allele A, x[i]=i/n, i=2,..., n, probability of being in state i
# (i alleles of type a, n-i alleles of type A)
x <- vector(mode = "numeric", length = n + 1)  # Init x to vector of 0's

# Start with state n_a = n_A; find the index corresponding to the state with
# number of a alleles = number of A alleles and set to 1
n_a <- ceiling(length(x)/2)
x[n_a] <- 1

# States 1, n+1 are absorbing states (alleles are fixed)
class <- c("absorbing", rep("normal", n - 1), "absorbing")
class_col <- c("black", rep("white", n - 1), "black")

# Make transition matrix, where an entry p_ij states the probability of
# transitioning from a state (i, n-i) with i alleles of type a, (n-i) alleles
# of type A, to state (j, n-j)
transmat <- do.call("rbind", lapply(0:n, function(i) {
    dbinom(0:n, n, i/n)
}))

# Number of generations
t <- 30

# The distribution at any time point is given as the initial distribution x
# times the transition matrix to the power of t
data <- t(x %*% (transmat %^% t))

# Make labels corresponding to the fraction of a alleles
xlabels <- unlist(lapply(0:n, function(x) {
    sprintf("frac(%i, %i)", x, n)
}))
xl <- do.call(c, lapply(xlabels, function(l) {
    parse(text = l)
}))
# Plot the results
barplot(height = data, beside = TRUE, col = class_col, names.arg = xl)

1.1.3 dn/ds ratios of human versus rat

R code to download dn/ds data for H.sapiens versus R.norvegicus from ensembl and plot
fn <- "assets/data/hsapiens_rat.dnds.tsv"
if (!file.exists(fn)) {
    library(biomaRt)
    ensembl <- useEnsembl(version = 99, biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
    genes <- getBM(mart = ensembl, attributes = c("ensembl_gene_id"))
    hsapiens_rat <- getBM(attributes = c("ensembl_gene_id", "rnorvegicus_homolog_ensembl_gene",
        "rnorvegicus_homolog_dn", "rnorvegicus_homolog_ds", "rnorvegicus_homolog_orthology_type"),
        filters = "ensembl_gene_id", values = genes$ensembl_gene_id, mart = ensembl)
    hsapiens_rat <- subset(hsapiens_rat, !is.na(rnorvegicus_homolog_ds) & !is.na(rnorvegicus_homolog_dn) &
        rnorvegicus_homolog_orthology_type == "ortholog_one2one")
    hsapiens_rat$dnds <- hsapiens_rat$rnorvegicus_homolog_dn/hsapiens_rat$rnorvegicus_homolog_ds
    hsapiens_rat <- subset(hsapiens_rat, !is.nan(dnds))
    write.table(hsapiens_rat, file = "assets/data/hsapiens_rat.dnds.tsv", sep = "\t",
        quote = FALSE, row.names = FALSE)
}
x <- read.table(fn, header = TRUE)
ggplot(x, aes(x = dnds)) + geom_histogram(bins = 100, aes(y = ..count../sum(..count..)),
    fill = "white", color = "black") + xlim(0, 2) + xlab(expression(d[n] ~ "/" ~
    d[s])) + ylab("Frequency") + ggtitle(paste0(nrow(x), " human-rat orthologue pairs"))

1.2 Simulation

1.2.1 msprime trees

Python code to generate four coalescent trees under different evolutionary scenarios.
import msprime


def tree_topology(
    model,
    *,
    svgid,
    size=(300, 500),
    x_axis=False,
    node_labels={},
    symbol_size=0,
    style=".edge {stroke-width: 2px}",
):
    """Plot tree topology under different evolutionary scenarios"""
    kwargs = dict(
        size=size,
        x_axis=x_axis,
        node_labels=node_labels,
        symbol_size=symbol_size,
        style=f"#{svgid} {{ {style} }}",
        root_svg_attributes={"id": svgid},
    )
    if model == "neutral":
        ts = msprime.sim_ancestry(10, random_seed=12)
    elif model == "expansion":
        demography = msprime.Demography()
        demography.add_population(name="A", initial_size=10_000, growth_rate=0.1)
        ts = msprime.sim_ancestry(
            samples={"A": 10}, demography=demography, random_seed=12
        )
    elif model == "bottleneck":
        demography = msprime.Demography()
        demography.add_population(name="A", initial_size=1_000)
        demography.add_instantaneous_bottleneck(time=100, strength=1000, population=0)
        ts = msprime.sim_ancestry(
            samples={"A": 10}, demography=demography, random_seed=12
        )
    elif model == "selection":
        Ne = 1_000
        L = 1e6
        sweep_model = msprime.SweepGenicSelection(
            position=L / 2,
            start_frequency=1.0 / (2 * Ne),
            end_frequency=1.0 - (1.0 / (2 * Ne)),
            s=0.25,
            dt=1e-6,
        )
        ts = msprime.sim_ancestry(
            10,
            model=[sweep_model, msprime.StandardCoalescent()],
            population_size=Ne,
            sequence_length=L,
            random_seed=119,
        )
    return ts.draw_svg(**kwargs)

1.3 Genetic diversity

1.3.1 Trees

Python code to generate example coalescent trees.
import msprime
import pandas as pd

# Small tree for calculation purposes
ts_small = msprime.sim_ancestry(2, sequence_length=1e4, random_seed=38)
ts_small_mut = msprime.sim_mutations(ts_small, rate=3e-5, random_seed=4)

# Balancing selection topology
ts_small_balance = msprime.sim_ancestry(2, sequence_length=1e4, random_seed=7)
ts_small_balance_mut = msprime.sim_mutations(
    ts_small_balance, rate=2.5e-5, random_seed=7
)


def genotypes(ts, *, sample_names=True, sample_prefix="sample ", collapse=True):
    gt = ts.genotype_matrix().T
    x = pd.DataFrame(gt)
    if collapse:
        x = x.apply(lambda row: "".join(row.values.astype(str)), axis=1)
    if sample_names:
        x.index = [f"{sample_prefix}{i}" for i in x.index]
    return x


def plot_tree(
    ts, *, id_string, show_sequences=True, node_labels=dict(), size=(250, 300), **kwargs
):
    css_extra = kwargs.pop("css_extra", "")
    css_string = (
        ".edge {stroke: black; stroke-width: 3px}"
        ".node > .lab {text-anchor: end; transform: rotate(-45deg) translate(-5px, 8px);}"
        ".node > .sym, .node:not(.leaf) > .lab {display: none}"
        ".node:not(.leaf) > .lab {display: none}"
    ) + css_extra
    css = kwargs.pop("css_string", css_string)
    node_labels = kwargs.pop("node_labels", None)
    if show_sequences:
        node_labels = dict()
        for ind in range(ts.num_samples):
            node_labels[ind] = f"{ind}: " + "".join(
                [str(x) for x in ts.genotype_matrix()[:, ind]]
            )
    style = f"#{id_string} {{" + css + "}"
    return ts.draw_svg(
        size=size,
        node_labels=node_labels,
        style=style,
        root_svg_attributes={"id": id_string},
        **kwargs,
    )