diff --git a/Makefile b/Makefile index 7bfeb7f..32c32a3 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,11 @@ all : popSimManu.pdf response-to-reviewers.pdf popSimManu.pdf: popSimManu.tex model_table.tex popSim.bib -popSimManu-diff-to-submission.tex : popSimManu.tex +popSimManu-diff-to-submission.tex : popSimManu.tex latexdiff-git -r 8d336be37e4b1261dcc442c41b61585167c89cfd popSimManu.tex mv popSimManu-diff8d336be37e4b1261dcc442c41b61585167c89cfd.tex $@ -popSimManu-diff-to-master.tex : popSimManu.tex +popSimManu-diff-to-master.tex : popSimManu.tex latexdiff-git -r master popSimManu.tex mv popSimManu-diffmaster.tex $@ diff --git a/model_table.tex b/model_table.tex index cc29b06..cb78cc0 100644 --- a/model_table.tex +++ b/model_table.tex @@ -10,36 +10,40 @@ \midrule \multicolumn{6}{l}{ HomSap (\emph{Homo sapiens}) } \\ & -Africa\_1T12& \cite{tennessen2012evolution} & 10.2& 191.3& 23.3\\ +Africa\_1T12& \cite{tennessen2012evolution} & 10.4& 193.3& 23.3\\ & -Zigzag\_1S14& \cite{schiffels2014inferring} & 3.4& 103.5& 7.9\\ +Zigzag\_1S14& \cite{schiffels2014inferring} & 3.4& 105.0& 7.9\\ & -OutOfAfrica\_3G09& \cite{gutenkunst2009inferring} & 11.4& 181.6& 21.4\\ +AshkSub\_7G19& \cite{gladstein2019substructured} & 15.7& 215.3& 26.4\\ & -OutOfAfrica\_2T12& \cite{tennessen2012evolution} & 12.4& 200.4& 24.7\\ +OutOfAfrica\_3G09& \cite{gutenkunst2009inferring} & 10.9& 181.3& 21.1\\ & -AncientEurasia\_9K19& \cite{kamm2019efficiently} & 64.8& 303.1& 41.2\\ +OutOfAfrica\_2T12& \cite{tennessen2012evolution} & 11.3& 198.0& 24.1\\ & -AmericanAdmixture\_4B11& \cite{browning2018ancestry} & 10.6& 185.0& 22.3\\ +AncientEurasia\_9K19& \cite{kamm2019efficiently} & 69.4& 304.1& 41.2\\ & -OutOfAfricaArchaicAdmixture\_5R19& \cite{ragsdale2019models} & 9.1& 182.1& 21.7\\ +AmericanAdmixture\_4B11& \cite{browning2018ancestry} & 11.1& 187.3& 22.3\\ +& +PapuansOutOfAfrica\_10J19& \cite{jacobs2019multiple} & 234.7& 526.3& 77.8\\ +& +OutOfAfricaArchaicAdmixture\_5R19& \cite{ragsdale2019models} & 
9.6& 184.5& 21.7\\ \midrule \multicolumn{6}{l}{ DroMel (\emph{Drosophila melanogaster}) } \\ & -OutOfAfrica\_2L06& \cite{li2006inferring} & 0.6& 66.7& 1.6\\ +OutOfAfrica\_2L06& \cite{li2006inferring} & 0.6& 68.7& 1.6\\ & -African3Epoch\_1S16& \cite{sheehan2016deep} & 0.5& 58.8& 0.2\\ +African3Epoch\_1S16& \cite{sheehan2016deep} & 0.5& 60.9& 0.2\\ \midrule \multicolumn{6}{l}{ AraTha (\emph{Arabidopsis thaliana}) } \\ & -African2Epoch\_1H18& \cite{huber2018gene} & 379.5& 358.2& 50.7\\ +African2Epoch\_1H18& \cite{huber2018gene} & 434.1& 359.2& 50.7\\ & -African3Epoch\_1H18& \cite{huber2018gene} & 187.1& 399.5& 58.0\\ +African3Epoch\_1H18& \cite{huber2018gene} & 208.6& 400.6& 58.0\\ +& +SouthMiddleAtlas\_1D17& \cite{durvasula2017african} & 159.6& 315.4& 43.1\\ +\midrule +\multicolumn{6}{l}{ PonAbe (\emph{Pongo abelii}) } \\ & -SouthMiddleAtlas\_1D17& \cite{durvasula2017african} & 141.1& 315.8& 43.1\\ -%\midrule -%\multicolumn{6}{l}{ PonPyg (\emph{Pongo pygmaeus}) } \\ -%& -%TwoSpecies\_2L11& \cite{locke2011comparative} & 9.2& 170.0& 15.5\\ +TwoSpecies\_2L11& \cite{locke2011comparative} & 7.4& 170.5& 14.7\\ \bottomrule \end{tabular} diff --git a/popSim.bib b/popSim.bib index f5250d8..5e10d73 100644 --- a/popSim.bib +++ b/popSim.bib @@ -738,3 +738,19 @@ @article {uricchio2014robust eprint = {https://www.genetics.org/content/197/1/221.full.pdf}, journal = {Genetics} } + +@article{gladstein2019substructured, + author = {Gladstein, Ariella L and Hammer, Michael F}, + doi = {10.1093/molbev/msz047}, + editor = {Rogers, Rebekah}, + issn = {1537-1719}, + journal = {Molecular Biology and Evolution}, + month = {Mar}, + number = {6}, + pages = {1162--1171}, + publisher = {Oxford University Press (OUP)}, + title = {Substructured Population Growth in the Ashkenazi Jews Inferred with Approximate Bayesian Computation}, + url = {http://dx.doi.org/10.1093/molbev/msz047}, + volume = {36}, + year = {2019} +} diff --git a/popSimManu.tex b/popSimManu.tex index afa439e..2a10a61 100644 
--- a/popSimManu.tex +++ b/popSimManu.tex @@ -194,11 +194,11 @@ \section*{Introduction} translating them into input for a simulator, and choosing appropriate values for key population genetic parameters, such as the mutation and recombination rates. -%ACS: This paragraph seems to me to need a more explicit connection to +%ACS: This paragraph seems to me to need a more explicit connection to %the enterprise of providing standardized simulations. It feels a bit tangential %as written. I have tried to edit it accordingly. A related issue is that it has been challenging to assess the degree to which modeling assumptions -and choices of data summaries can affect population genetic inferences. +and choices of data summaries can affect population genetic inferences. Standardized simulations would enable these questions to be systematically examined. Importantly, there are clear examples of different methods yielding fundamentally different conclusions. For example, Markovian coalescent methods applied to human genomes have @@ -210,7 +210,7 @@ \section*{Introduction} performance of the inference. Furthermore, some methods are likely to perform better than others under certain scenarios, but researchers lack principled guidelines for selecting the best method for addressing -their particular questions. The need for +their particular questions. The need for guidance from simulated data will only increase as researchers seek to apply population genetic methods to a growing collection of non-model taxa. @@ -273,12 +273,12 @@ \section*{Introduction} \section*{Results} %ACS: the consortium really isn't the point; let's get right to describing what stdpopsim is -%The first contribution of the PopSim consortium is +%The first contribution of the PopSim consortium is The \stdpopsim library is a a community-maintained collection of empirical genome data and population genetics simulation models, with a structure depicted in Figure \ref{fig:cartoon}. 
The package centers on a catalog of genomic information and demographic models -for a growing list of species (Fig.~\ref{fig:cartoon}A), +for a growing list of species (Fig.~\ref{fig:cartoon}A), and software resources to facilitate efficient simulations (Fig.~\ref{fig:cartoon}B-C). Given the genome data and simulation model descriptions defined within the @@ -292,11 +292,11 @@ \section*{Results} \citep{kelleher2016efficient,kelleher2018efficient}. The tree sequence format could also be converted to other formats (e.g., VCF) by the user if desired. - + \subsection*{The species catalog} The central feature of \stdpopsim is the species catalog, a systematic organization -of the key quantitative data needed to simulate a given species. +of the key quantitative data needed to simulate a given species. Data is currently available for humans, \textit{D.~melanogaster}, \textit{A.~thaliana}, \textit{Escherichia coli}, and \textit{Canis familiaris}. @@ -332,7 +332,7 @@ \subsection*{The species catalog} simple, single population histories \cite[e.g.,][]{sheehan2016deep}, to complex models which include population splitting, migration, and archaic admixture \cite[e.g.,][]{ragsdale2019models}. -In addition to models from Table \autoref{tab:catalog}, +In addition to models from \autoref{tab:catalog}, the PopSim Consortium has demographic models for \textit{Pongo abelii}, \textit{Canis familiaris}, and \textit{Escherichia coli} in development at time of writing. \renewcommand{\arraystretch}{1.2} @@ -350,7 +350,7 @@ \subsection*{The species catalog} In each case, we simulate 100 samples drawn from the first population, for the shortest chromosome of that species and a constant chromosome-specific recombination rate. -The times reported are for a single run on an Intel i5-7600 CPU. +The times reported are for a single run on an Intel i5-7600K CPU. Computing resources required will vary widely depending on sample sizes, chromosome length, recombination rates and other factors. 
} @@ -386,7 +386,7 @@ \subsection*{The species catalog} %For \emph{E. coli} we have a simple, large constant population size model implemented. In addition to organism-specific models, \stdpopsim also includes a generic piecewise constant size model and -isolation with migration (IM) model which can be used with any genome and genetic map. +isolation with migration (IM) model which can be used with any genome and genetic map. Together these models contain many features believed to affect observed patterns of polymorphism (e.g., bottlenecks, population growth, admixture) and therefore provide useful benchmarks for method development. @@ -406,7 +406,7 @@ \subsection*{Simulation engines} Currently, \stdpopsim uses the \texttt{msprime} coalescent simulator \citep{kelleher2016efficient} as the default simulation engine. Coalescent simulations, while highly efficient, are limited in their ability to model -continuous geography or +continuous geography or complex selection scenarios, such as recurrent sweeps and background selection. For these reasons, we have also implemented the forward-time simulator, \texttt{SLiM} \citep{haller2019tree,haller2019slim}, @@ -434,7 +434,8 @@ \subsection*{Simulation engines} We validated our implementation of the \texttt{SLiM} engine by comparing estimates of several population genetic summary statistics for neutral simulations generated by both \texttt{SLiM} and \texttt{msprime}. -Examples of this validation for the AncientEurasia\_9K19 model \citep{AncientEurasia 9K19} +Examples of this validation for the AncientEurasia\_9K19 model +\citep{kamm2019efficiently} are shown in Figures \ref{fig:slim_val_map} and \ref{fig:slim_val_nomap}. 
For this model, down-scaling factors of up to $50$ produce patterns of both diversity and linkage disequilibrium that are @@ -483,7 +484,7 @@ \subsection*{Use case: comparing methods of demographic inference} available from \url{https://github.com/popsim-consortium/analysis}, that allow efficient computing in multicore or cluster environments. Our workflow generates $R$ replicates of $C$ chromosomes, -% ACS: "sample" could be confusing because we are both sampling +% ACS: "sample" could be confusing because we are both sampling % by simulation and sampling from a population. Does "population samples" (below) help? producing $n$ population samples in each of a total of $R \times C$ simulations for each demographic model. After simulation, @@ -495,7 +496,7 @@ \subsection*{Use case: comparing methods of demographic inference} Each of the inference programs are then run in parallel, and finally, estimates of population size history from each program are plotted. -% ACS: do we need the "Homo sapiens" at top left in the figure? It breaks the visual symmetry among YRI, CEU, and CHB, and +% ACS: do we need the "Homo sapiens" at top left in the figure? It breaks the visual symmetry among YRI, CEU, and CHB, and % I think it is unnecessary given the population labels and the description in the legend % Also: should we include "N(t)" after "Population size" in the y-axis label, to drive home its meaning (used in the text and legend)? \begin{figure} @@ -526,7 +527,7 @@ \subsubsection*{ Single-population demographic models.} under several of the demographic models described above. However, these experiments raise the question of what to use as the ``true" population sizes in the case of multi-population models with migration. In particular, a simple single-population model that is fit to data simulated -under a multi-population model, +under a multi-population model, is not expected to recover the actual simulated population sizes, because of model misspecification. 
Instead, we argue that the best one may expect in such a scenario is to infer a model that accurately reflects the @@ -575,7 +576,7 @@ \subsubsection*{ Single-population demographic models.} All methods recover population size within a factor of two of the simulated values, however SMC-based methods % ACS: suggest eliminating this vague conjecture. Which regularization exactly is meant? -%perhaps due to their regularization, +%perhaps due to their regularization, tend to infer sinusoidal patterns of population size even though no such change is present. @@ -652,7 +653,7 @@ \subsubsection*{Multi-population demographic models.} of the strengths and weaknesses of the different methods. For instance, the SFS-based approaches with simple IM models do not capture recent exponential growth in the CEU population, but do consistently recover the -simulated YRI population size history. Moreover, these approaches allow +simulated YRI population size history. Moreover, these approaches allow migration rates to be estimated (Figure \ref{fig:homsap_mig_rates}), and lead to more accurate inferences of divergence times. However, these migration rate estimates are somewhat biased. In contrast, \smcpp is much better at capturing the recent exponential @@ -730,7 +731,7 @@ \section*{Discussion} with the CEU history but tends to underestimate divergence times due to the assumption of no migration. The results for the two-population \textit{D.~melanogaster} model (Figure~\ref{fig:two_popn_fly}) are generally similar. -In these comparisons, \fastsimcoal and \dadi perform almost identically, which is expected because they fit the same models to the same summaries of the data, +In these comparisons, \fastsimcoal and \dadi perform almost identically, which is expected because they fit the same models to the same summaries of the data, differing only in how they calculate model expectations and optimize parameters. 
All methods for inferring demographic history have strengths and weaknesses \citep[as recently reviewed by][]{beichman2018review}. @@ -864,7 +865,7 @@ \subsection*{Model quality control} The possibility for error and the importance of careful qualty control was inadvertantly illustrated during our own development process: -during the final revisions of this paper, we noticed that the +during the final revisions of this paper, we noticed that the OutOfAfrica\_3G09 model \citep{gutenkunst2009inferring} had not in fact gone through the QC process. The subsequent QC process revealed that in fact our implementation @@ -922,10 +923,10 @@ \subsection*{Workflow for analysis of simulated data} used in empirical population genomic studies \citep[e.g.,][]{danecek20111000,langley2012genomic}. Specifically we masked all regions of \SI{1}{cM} or greater in the lowest 5th percentile of the empirical distribution of recombination, regions which are nearly uniformly absent for empirical analysis. -This approach to masking was chosen to prevent marginal trees with low or no recombination -from biasing the comparisons of demographic inference methods. It should be noted that masking is not implemented -within \texttt{stdpopsim} proper; tree sequences generated by \texttt{stdpopsim} are always raw and unmasked. -This allows users the flexibility to implement masking approaches that are specific to their needs +This approach to masking was chosen to prevent marginal trees with low or no recombination +from biasing the comparisons of demographic inference methods. It should be noted that masking is not implemented +within \texttt{stdpopsim} proper; tree sequences generated by \texttt{stdpopsim} are always raw and unmasked. +This allows users the flexibility to implement masking approaches that are specific to their needs for downstream analysis. 
Our second task was to explore inference with two-population models diff --git a/src/catalog_table.py b/src/catalog_table.py index 3e8e63e..47cf65d 100644 --- a/src/catalog_table.py +++ b/src/catalog_table.py @@ -38,7 +38,7 @@ def main(): if j < 2: print("&") print("\\\\") - for species_id in ["HomSap", "DroMel", "AraTha", "PonPyg"]: + for species_id in ["HomSap", "DroMel", "AraTha", "PonAbe"]: species = stdpopsim.get_species(species_id) escaped_id = species.id.replace("_", "\\_") species_header = f"{escaped_id} (\\emph{{{species.name}}})" @@ -53,7 +53,6 @@ def main(): file_size = float(row.file_size) / megabyte bibtex = model.citations[0].fetch_bibtex() bibtex_key = get_bibtex_key(bibtex, model_refs) - # bibtex_key = "ragsdale2019models" escaped_id = model.id.replace("_", "\\_") print("&") print(escaped_id, end="& ")