wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gzwget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz.tbiwget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel# Make population-specific sample filespanel=integrated_call_samples_v3.20130502.ALL.panelawk'{if ($2 == "CEU") print $1}'$panel> CEU.samples.txtawk'{if ($2 == "CHB") print $1}'$panel> CHB.samples.txtawk'{if ($2 == "YRI") print $1}'$panel> YRI.samples.txt# Subset vcf to these filesVCF=ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gzREGION=22:17000000-20000000for pop in CEU CHB YRI;doecho$popbcftools view -v snps -m 2 -M 2 -S$pop.samples.txt $VCF$REGION|bgzip-c>$pop.$REGION.vcf.gztabix-f$pop.$REGION.vcf.gzplink--vcf$pop.$REGION.vcf.gz --hardy--out$pop.$REGIONgzip-v$pop.${REGION}.hwedone
R code to plot HWE proportions
# Read genotypes from files generated by hwe.shpops<-c("CEU", "CHB", "YRI")fn<-file.path("data/Homo_sapiens/1000g/genotypes", paste0(pops, ".22:17000000-20000000.hwe.gz"))names(fn)<-pops# Make data frame from all populationsx<-do.call("rbind", lapply(pops, function(p){y<-cbind(population =p, read.table(fn[[p]], header =TRUE))# GENO column holds genotypes as minor/het/major counts.geno<-do.call("rbind", lapply(strsplit(y$GENO, "/"), as.numeric))# Find n that splits geno in two equally sized listsn<-round(nrow(geno)/2)# Because minor is always less than 50, we need to reverse the order of# half of the genotypes to produce nicer plotsgeno<-rbind(geno[1:n, ], geno[(n+1):nrow(geno), 3:1])colnames(geno)<-c("AA.cnt", "Aa.cnt", "aa.cnt")# Normalize counts to frequenciesgeno_frq<-as.data.frame(geno/apply(geno, 1, sum))colnames(geno_frq)<-c("AA", "Aa", "aa")# Calculate allele frequencies under HWE assumptionallele_frq<-as.data.frame(cbind(geno_frq$AA+0.5*geno_frq$Aa, geno_frq$aa+0.5*geno_frq$Aa))colnames(allele_frq)<-c("p", "q")# Make one large table with genotypes, genotype frequencies, and allele# frequenciescbind(y, geno, geno_frq, allele_frq)}))# Pivot genotypes to value columnx<-pivot_longer(x, cols =c("AA", "Aa", "aa"), names_to ="genotype")p<-seq(0, 1, 0.01)n<-10000# Sample 10000 snps from each populationz<-do.call("rbind", by(x, x$population, function(y){y[sample(nrow(y), 10000), ]}))z$population<-factor(z$population)levels(z$population)<-c("European", "Chinese", "Yoruban")# Plotting colorsvend<-0.8cols_hwe<-c(viridis_pal(end =vend)(3), "black", "black")lt_hwe<-c(rep("blank", 3), "dashed", "solid")shape_hwe<-c(rep(16, 3), NA, NA)cnames<-c("aa", "Aa", "AA", "Hardy Weinberg Equilibrium", "Mean")names(cols_hwe)<-cnamesnames(lt_hwe)<-cnamesnames(shape_hwe)<-cnames# Make plotggplot(z, aes(x =p, y =value))+geom_point(size =2, alpha =0.6, aes(color =genotype))+geom_smooth(method ="loess", linewidth =1, show.legend =TRUE, lty ="dashed",aes(group =genotype, color ="Mean"))+geom_line(inherit.aes =FALSE, aes(x =p, y =p^2, color ="Hardy Weinberg Equilibrium"))+geom_line(inherit.aes =FALSE,aes(x =p, y =2*p*(1-p), color ="Hardy Weinberg Equilibrium"))+geom_line(inherit.aes =FALSE,aes(x =p, y =(1-p)^2, color ="Hardy Weinberg Equilibrium"))+xlab("allele frequency")+ylab("genotype frequency")+facet_grid(.~population)+scale_color_viridis_d(end =vend)+scale_color_manual(name =NULL, values =cols_hwe, guide =guide_legend(override.aes =list(linetype =lt_hwe, shape =shape_hwe)))
1.1.2 Genetic drift as allele frequency distributions
R code to generate and plot allele frequency distribution
# Recipe to generate genetic drift histogram. The procedure consists of# tracking a fixed number of allelic states for a system with two alleles (say,# a and A), where the variable x corresponds to the *probability* of being in a# particular state. Default setting reproduce the data in Figure 3.4 in The# neutral theory of molecular evolution (Kimura, 1983).library(expm)# Matrix exponential library# Number of possible allelic states. Setting this too large results in a# prohibitively large transition matrixn<-10# The x vector traces the *probability* of samples/sequences in a given allelic# state, where x[1]=probability of fixation for allele a, x[n+1]=probability of# fixation for allele A, x[i]=i/n, i=2,..., n, probability of being in state i# (i alleles of type a, n-i alleles of type A)x<-vector(mode ="numeric", length =n+1)# Init x to vector of 0's# Start with state n_a = n_A; find the index corresponding to the state with# number of a alleles = number of A alleles and set to 1n_a<-ceiling(length(x)/2)x[n_a]<-1# States 1, n+1 are absorbing states (alleles are fixed)class<-c("absorbing", rep("normal", n-1), "absorbing")class_col<-c("black", rep("white", n-1), "black")# Make transition matrix, where an entry p_ij states the probability of# transitioning from a state (i, n-i) with i alleles of type a, (n-i) alleles# of type A, to state (j, n-j)transmat<-do.call("rbind", lapply(0:n, function(i){dbinom(0:n, n, i/n)}))# Number of generationst<-30# The distribution at any time point is given as the initial distribution x# times the transition matrix to the power of tdata<-t(x%*%(transmat%^%t))# Make labels corresponding to the fraction of a allelesxlabels<-unlist(lapply(0:n, function(x){sprintf("frac(%i, %i)", x, n)}))xl<-do.call(c, lapply(xlabels, function(l){parse(text =l)}))# Plot the resultsbarplot(height =data, beside =TRUE, col =class_col, names.arg =xl)
1.1.3 dn/ds ratios of human versus rat
R code to download dn/ds data for H.sapiens versus R.norvegicus from ensembl and plot