% Qin & Colwell (2018), "Power law tails in phylogenetic systems", PNAS 115(4):690--695.
% The `abstract` value appears to hold two passages from the publisher export back to back
% (a short summary followed by the full abstract) -- NOTE(review): confirm against the article page.
% `pdfurl` is a non-standard field that styles ignore; it keeps the full-text PDF link that the
% export had placed in `eprint`, which eprint-aware styles would render as an archive identifier.
@article{Qin690,
  author    = {Qin, Chongli and Colwell, Lucy J.},
  title     = {Power law tails in phylogenetic systems},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {115},
  number    = {4},
  pages     = {690--695},
  year      = {2018},
  publisher = {National Academy of Sciences},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.1711913115},
  url       = {https://www.pnas.org/content/115/4/690},
  pdfurl    = {https://www.pnas.org/content/115/4/690.full.pdf},
  abstract  = {Covariance analysis of protein sequence alignments can predict structure and function from sequence alignments alone. Current methodologies typically assume that sequences are independent, notwithstanding their phylogenetic relationships. This corruption constrains the alignments for which covariance analysis can be used. It is critically important to control for phylogeny and understand how phylogeny contaminates signal. This paper presents a mathematical analysis that argues that there is a distinctive signature of phylogeny in the covariance matrix, allowing us to identify modes that are corrupted by phylogeny. This signature is present in large protein sequence alignments, explaining recent covariance analyses, and provides an important step toward decoupling phylogenetic effects from biologically meaningful interactions. Covariance analysis of protein sequence alignments uses coevolving pairs of sequence positions to predict features of protein structure and function. However, current methods ignore the phylogenetic relationships between sequences, potentially corrupting the identification of covarying positions. Here, we use random matrix theory to demonstrate the existence of a power law tail that distinguishes the spectrum of covariance caused by phylogeny from that caused by structural interactions. The power law is essentially independent of the phylogenetic tree topology, depending on just two parameters{\textemdash}the sequence length and the average branch length. We demonstrate that these power law tails are ubiquitous in the large protein sequence alignments used to predict contacts in 3D structure, as predicted by our theory. This suggests that to decouple phylogenetic effects from the interactions between sequence distal sites that control biological function, it is necessary to remove or down-weight the eigenvectors of the covariance matrix with largest eigenvalues. We confirm that truncating these eigenvectors improves contact prediction.},
}