diff --git a/.gitignore b/.gitignore index 47896a8..67e7dbd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,19 @@ -*.Rproj +.DS_Store .Rproj.user .Rhistory .RData .Ruserdata .vscode +*.Rproj +RcppTskit/**/.quarto/ +RcppTskit/inst/doc RcppTskit/src/*.o RcppTskit/src/tskit/*.o RcppTskit/src/*.so RcppTskit/src/*.dll* RcppTskit/src/Makevars RcppTskit/src/Makevars.win +RcppTskit/vignettes/*.html +RcppTskit/vignettes/*.R +RcppTskit/vignettes/*_files test.trees -.DS_Store -*.html diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..3fbcd74 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,126 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at gregor.gorjanc@gmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][https://github.com/mozilla/inclusion]. + +For answers to common questions about this code of conduct, see the FAQ at +. Translations are available at . + +[homepage]: https://www.contributor-covenant.org diff --git a/README.md b/README.md index 4a23a97..802ba14 100644 --- a/README.md +++ b/README.md @@ -9,17 +9,15 @@ Tskit provides Python, C, and Rust APIs. The Python API can be called from R via the `reticulate` R package to seamlessly load and analyse a tree sequence, as described at https://tskit.dev/tutorials/RcppTskit.html. `RcppTskit` provides R access to the `tskit` C API for use cases where the -`reticulate` approach is not optimal. For example, for high-performance and -low-level work with tree sequences. Currently, `RcppTskit` provides a very limited +`reticulate` option is not optimal. For example, for high-performance and +low-level work with tree sequences. Currently, `RcppTskit` provides a limited number of R functions due to the availability of extensive Python API and -the `reticulate` approach. +the `reticulate` option. -See more details on the state of the tree sequence ecosystem and aims for -`RcppTskit` in [RcppTskit/inst/STATE_and_AIMS.md](RcppTskit/inst/STATE_and_AIMS.md), -including examples on how to use it on its own or to develop new R packages. - -TODO: Think how to best point to use cases. Probably best to point to vignette and pkgdown!? - https://github.com/HighlanderLab/RcppTskit/issues/10 +See more details on the state of the tree sequence ecosystem and aims of +`RcppTskit` in [RcppTskit/vignettes/RcppTskit_intro.qmd](RcppTskit/vignettes/RcppTskit_intro.qmd). +The vignette also shows examples on how to use `RcppTskit` on its own or +to develop new R packages. ## Status @@ -31,13 +29,17 @@ TODO: Think how to best point to use cases. Probably best to point to vignette a [![CRAN version](https://www.r-pkg.org/badges/version/RcppTskit)](https://CRAN.R-project.org/package=RcppTskit) ![GitHub version (main)](https://img.shields.io/github/r-package/v/HighlanderLab/RcppTskit/main?filename=RcppTskit%2FDESCRIPTION&label=Github) [![Downloads - total](https://cranlogs.r-pkg.org/badges/grand-total/RcppTskit)](https://cranlogs.r-pkg.org/badges/grand-total/RcppTskit) -[![CRAN R CMD check](https://cranchecks.info/badges/summary/RcppTskit)](https://cran.r-project.org/web/checks/check_results_RcppTskit.html) ![GitHub R CMD check](https://img.shields.io/github/actions/workflow/status/HighlanderLab/RcppTskit/R-CMD-check.yaml?label=GitHub%20R%20CMD%20check) [![Codecov test coverage](https://codecov.io/gh/HighlanderLab/RcppTskit/graph/badge.svg)](https://app.codecov.io/gh/HighlanderLab/RcppTskit) +[![CRAN R CMD check](https://cranchecks.info/badges/summary/RcppTskit)](https://cran.r-project.org/web/checks/check_results_RcppTskit.html) [![GitHub R CMD check](https://img.shields.io/github/actions/workflow/status/HighlanderLab/RcppTskit/R-CMD-check.yaml?label=GitHub%20R%20CMD%20check)](https://github.com/HighlanderLab/RcppTskit/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/HighlanderLab/RcppTskit/graph/badge.svg)](https://app.codecov.io/gh/HighlanderLab/RcppTskit) ## Contents - * `extern` - Git submodule for `tskit` and instructions on obtaining the latest version and copying the `tskit` C code into `RcppTskit` directory. `extern` is saved outside of the `RcppTskit` directory because `R CMD CHECK` complains otherwise. + * `extern` - Git submodule for `tskit` and instructions on + obtaining the latest version and copying the `tskit` C code into + `RcppTskit` directory. + `extern` is saved outside of the `RcppTskit` directory + because `R CMD CHECK` complains otherwise. * `RcppTskit` - R package `RcppTskit`. @@ -67,22 +69,26 @@ https://mac.r-project.org/tools for macOS tools. ``` # install.packages("remotes") # If you don't have it already -# Release (TODO) +# Release # TODO: Tag a release #15 # https://github.com/HighlanderLab/RcppTskit/issues/15 # remotes::install_github("HighlanderLab/RcppTskit/RcppTskit") # Main branch -remotes::install_github("HighlanderLab/RcppTskit/RcppTskit") +remotes::install_github("HighlanderLab/RcppTskit/RcppTskit@main") # Development branch -# TODO: Create a devel branch #16 -# https://github.com/HighlanderLab/RcppTskit/issues/16 -# remotes::install_github("HighlanderLab/RcppTskit/RcppTskit@devel") +remotes::install_github("HighlanderLab/RcppTskit/RcppTskit@devel") ``` ## Development +### Code of Conduct + +Please note that the `RcppTskit` project is released with a +[Contributor Code of Conduct](https://contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). +By contributing to this project, you agree to abide by its terms. + ### Clone First clone the repository: @@ -93,7 +99,8 @@ git clone https://github.com/HighlanderLab/RcppTskit.git ### Pre-commit install -We use [pre-commit](https://pre-commit.com) hooks to ensure code quality. Specifically, we use: +We use [pre-commit](https://pre-commit.com) hooks to ensure code quality. +Specifically, we use: * [air](https://github.com/posit-dev/air) to format R code, * [jarl](https://github.com/etiennebacher/jarl) to lint R code, * [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C/C++ code, and @@ -111,7 +118,8 @@ If you plan to update `tskit`, follow instructions in `extern/README.md`. ### RcppTskit -Then open `RcppTskit` package directory in your favourite R IDE (Positron, RStudio, text-editor-of-your-choice, etc.) and implement your changes. +Then open `RcppTskit` package directory in your favourite R IDE +(Positron, RStudio, text-editor-of-your-choice, etc.) and implement your changes. You should routinely check your changes (in R): @@ -161,6 +169,8 @@ pre-commit run --all-files ### Continuous integration -We use Github Actions to run continuous integration (CI) checks on each push and pull request. Specifically, we run: +We use Github Actions to run continuous integration (CI) checks +on each push and pull request. +Specifically, we run: * [R CMD check](.github/workflows/R-CMD-check.yaml) on multiple platforms and * [covr test coverage](.github/workflows/covr.yaml). diff --git a/RcppTskit/.Rbuildignore b/RcppTskit/.Rbuildignore index b042ca7..3d4c74b 100644 --- a/RcppTskit/.Rbuildignore +++ b/RcppTskit/.Rbuildignore @@ -9,8 +9,12 @@ ^\.clang-format$ ^\.covrignore$ ^\.github$ -^pkg_dev_notes\.md$ ^codecov\.yaml$ -^notes_pkg_dev\.Rmd$ ^inst/examples/create_test\.trees\.R$ ^inst/examples/create_test\.trees\.py$ +^notes_pkg_dev\.Rmd$ +^pkg_dev_notes\.md$ +^tests/testthat/_snaps$ +^vignettes/\.quarto$ +^vignettes/*_files$ +^\.\.$ diff --git a/RcppTskit/DESCRIPTION b/RcppTskit/DESCRIPTION index f0f0f72..18e10ed 100644 --- a/RcppTskit/DESCRIPTION +++ b/RcppTskit/DESCRIPTION @@ -1,30 +1,44 @@ -Package: RcppTskit Type: Package +Package: RcppTskit Title: R access to the tskit C API Version: 0.1.0 Date: 2026-01-2607 Authors@R: - person(given = "Gregor", family = "Gorjanc", - email = "gregor.gorjanc@gmail.com", - role = c("aut", "cre"), - comment = c(ORCID = "0000-0001-8008-2787")) + person("Gregor", "Gorjanc", , "gregor.gorjanc@gmail.com", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-8008-2787")) Description: Tskit enables performant storage, manipulation, and analysis - of ancestral recombination graphs (ARGs) using succinct tree sequence encoding. - See https://tskit.dev for project news, documentation, and tutorials. - Tskit provides Python, C, and Rust APIs. The Python API can be called from R - via the `reticulate` R package to seamlessly load and analyse - a tree sequence as described at https://tskit.dev/tutorials/RcppTskit.html. - `RcppTskit` provides R access to the `tskit` C API for use cases where the - `reticulate` approach is not optimal. For example, for high-performance and - low-level work with tree sequences. Currently, `RcppTskit` provides a very - limited number of R functions due to the availability of extensive Python API - and the `reticulate` approach. -URL: https://github.com/highlanderlab/RcppTskit + of ancestral recombination graphs (ARGs) using succinct tree sequence + encoding. See https://tskit.dev for project news, documentation, and + tutorials. Tskit provides Python, C, and Rust APIs. The Python API + can be called from R via the `reticulate` R package to seamlessly load + and analyse a tree sequence as described at + https://tskit.dev/tutorials/RcppTskit.html. `RcppTskit` provides R + access to the `tskit` C API for use cases where the `reticulate` + option is not optimal. For example, for high-performance and low-level + work with tree sequences. Currently, `RcppTskit` provides a limited + number of R functions due to the availability of extensive Python API + and the `reticulate` option. License: MIT + file LICENSE -Depends: R (>= 4.0.0) -Imports: methods, R6, Rcpp (>= 1.1.0), reticulate -LinkingTo: Rcpp -Suggests: covr, spelling, testthat -RoxygenNote: 7.3.3 +URL: https://github.com/HighlanderLab/RcppTskit +BugReports: https://github.com/HighlanderLab/RcppTskit/issues +Depends: + R (>= 4.0.0) +Imports: + methods, + R6, + Rcpp (>= 1.1.0), + reticulate +Suggests: + covr, + knitr, + quarto, + spelling, + testthat (>= 3.0.0) +LinkingTo: + Rcpp +VignetteBuilder: + quarto +Config/testthat/edition: 3 Encoding: UTF-8 Language: en-GB +RoxygenNote: 7.3.3 diff --git a/RcppTskit/NEWS.md b/RcppTskit/NEWS.md index 63fc6bd..3f7bea6 100644 --- a/RcppTskit/NEWS.md +++ b/RcppTskit/NEWS.md @@ -13,8 +13,11 @@ This is the first release. - Initial version of RcppTskit using the tskit C API (version 1.3.0). - TreeSequence R6 class so R code looks Pythonic. - `ts_load()` or `TreeSequence$new()` to load a tree sequence from file into R. -- Methods to summarise a tree sequence and its contents `ts$print()`, `ts$num_nodes()`, etc. +- Methods to summarise a tree sequence and its contents `ts$print()`, + `ts$num_nodes()`, etc. - Method to save a tree sequence to a file `ts$dump()`. -- Method to push tree sequence between R and reticulate Python (TODO). -- Most methods have underlying C++ function that work with a pointer to tree sequence object, - for example, `RcppTskit:::ts_load_ptr()`. +- Method to push tree sequence between R and reticulate Python + `ts$r_to_py()` and `ts_py_to_r()`. +- Most methods have an underlying (unexported) C++ function that works with + a pointer to tree sequence object,for example, `RcppTskit:::ts_load_ptr()`. +- All implemented functionality is documented and demonstrated with a vignette. diff --git a/RcppTskit/R/RcppTskit-package.R b/RcppTskit/R/RcppTskit-package.R index f779234..037512c 100644 --- a/RcppTskit/R/RcppTskit-package.R +++ b/RcppTskit/R/RcppTskit-package.R @@ -8,10 +8,10 @@ #' via the `reticulate` R package to seamlessly load and analyse a tree sequence #' as described at https://tskit.dev/tutorials/tskitr.html. #' `RcppTskit` provides R access to the `tskit` C API for use cases where the -#' `reticulate` approach is not optimal. For example, for high-performance -#' and low-level work with tree sequences. Currently, `RcppTskit` provides a very +#' `reticulate` option is not optimal. For example, for high-performance +#' and low-level work with tree sequences. Currently, `RcppTskit` provides a #' limited number of R functions due to the availability of extensive Python API -#' and the `reticulate` approach. +#' and the `reticulate` option. #' @keywords internal #' #' @useDynLib RcppTskit, .registration = TRUE @@ -21,76 +21,7 @@ #' @importFrom reticulate is_py_object import py_module_available py_require #' #' @examples -#' \dontshow{# Providing the examples here so we test them via R CMD check} -#' # Here are examples showcasing what you can do with RcppTskit -#' -#' # 1) Load a tree sequence into R and summarise it -#' # Load a tree sequence -#' ts_file <- system.file("examples/test.trees", package = "RcppTskit") -#' ts <- ts_load(ts_file) -#' -#' # Print summary of the tree sequence -#' ts$num_individuals() -#' ts -#' -#' # 2) Pass tree sequence between R and reticulate or standard Python -#' -#' # Tree sequence in R -#' ts_file <- system.file("examples/test.trees", package = "RcppTskit") -#' ts <- ts_load(ts_file) -#' -#' # If you have a tree sequence in R and want to use tskit Python API, use -#' ts_py <- ts$r_to_py() -#' # ... continue in reticulate Python ... -#' ts_py$num_individuals # 80 -#' ts2_py = ts_py$simplify(samples = c(0L, 1L, 2L, 3L)) -#' ts2_py$num_individuals # 2 -#' # ... and to bring it back to R use ... -#' ts2 <- ts_py_to_r(ts2_py) -#' ts2$num_individuals() # 2 -#' -#' # If you prefer standard (non-reticulate) Python, use -#' ts_file <- tempfile() -#' print(ts_file) -#' ts$dump(file = ts_file) -#' # ... continue in standard Python ... -#' # import tskit -#' # ts = tskit.load("insert_ts_file_path_here") -#' # ts.num_individuals # 80 -#' # ts2 = ts.simplify(samples = [0, 1, 2, 3]) -#' # ts2.num_individuals # 2 -#' # ts2.dump("insert_ts_file_path_here") -#' # ... and to bring it back to R use ... -#' ts2 <- ts_load(ts_file) -#' ts$num_individuals() # 2 (if you have ran the above Python code) -#' -#' # 3) Call tskit C API in C++ code in R session or script -#' library(Rcpp) -#' # Write and compile a C++ function -#' codeString <- ' -#' #include -#' int ts_num_individuals(SEXP ts) { -#' Rcpp::XPtr ts_xptr(ts); -#' return (int) tsk_treeseq_get_num_individuals(ts_xptr); -#' }' -#' ts_num_individuals2 <- Rcpp::cppFunction(code=codeString, -#' depends="RcppTskit", -#' plugins="RcppTskit") -#' # We must specify both the `depends` and `plugins` arguments! -#' -#' # Load a tree sequence -#' ts_file <- system.file("examples/test.trees", package="RcppTskit") -#' ts <- ts_load(ts_file) -#' -#' # Apply the compiled function -#' ts_num_individuals2(ts$pointer) -#' -#' # An identical RcppTskit implementation -#' ts$num_individuals() -#' -#' # 4) Call `tskit` C API in C++ code in another R package -#' # TODO: Show vignette here -#' # https://github.com/HighlanderLab/RcppTskit/issues/10 +#' vignette(package="RcppTskit") "_PACKAGE" #' Providing an inline plugin so we can call tskit C API with functions like diff --git a/RcppTskit/cleanup.win b/RcppTskit/cleanup.win index 0387251..3f4a8dc 100755 --- a/RcppTskit/cleanup.win +++ b/RcppTskit/cleanup.win @@ -5,4 +5,4 @@ rm -f src/Makevars.win # Removing compilation files -rm -f src/*.o src/tskit/*.o src/*.dll +rm -f src/*.o src/tskit/*.o src/*.dll src/*.dll.a diff --git a/RcppTskit/inst/STATE_and_AIMS.md b/RcppTskit/inst/STATE_and_AIMS.md deleted file mode 100644 index 6eea734..0000000 --- a/RcppTskit/inst/STATE_and_AIMS.md +++ /dev/null @@ -1,347 +0,0 @@ -# STATE of the tree sequence ecosystem and AIMS for `RcppTskit` - -## STATE of the tree sequence ecosystem - -The tree sequence ecosystem is rapidly evolving (see https://tskit.dev). There are now multiple packages supporting generation of tree sequences and there is a growing number of APIs for analysing them. This is a quick summary of the state of the ecosystem as of December 2025. - - * `tskit` (https://tskit.dev/tskit/docs, https://github.com/tskit-dev/tskit) is the core tree sequence toolkit. It has a C API and Python API. Python API is popular entry point for most users. There is also a Rust API that interfaces the C API. - - * `msprime` (https://tskit.dev/msprime/docs, https://github.com/tskit-dev/msprime) generates tree sequences with backward in time simulation. It has Python API and command line interface. - - * `SLiM` (https://messerlab.org/slim, https://github.com/MesserLab/SLiM) generates tree sequences with forward in time simulation. It is written in C++ (with embedded `tskit` C library) and has command line and GUI interface. It's tree sequence recording is described in great detail at https://github.com/MesserLab/SLiM/blob/master/treerec/implementation.md. - - * `pyslim` (https://tskit.dev/pyslim/docs, https://github.com/tskit-dev/pyslim) provides a Python API for reading and modifying `tskit` tree sequence files produced by SLiM, or modifying files produced by other programs (e.g., msprime) for use in SLiM. - - * `fwdpy11` (https://molpopgen.github.io/fwdpy11, https://github.com/molpopgen/fwdpy11) generates tree sequences with forward in time simulation. It has a Python API, which is built on C++ API (`fwdpp`). - - * `stdpopsim` (https://popsim-consortium.github.io/stdpopsim-docs, https://github.com/popsim-consortium/stdpopsim) is a standard library of population genetic models used in simulations with `msprime` and `SLiM`. It has Python API and command line interface. - - * `slendr` (https://bodkan.net/slendr, https://github.com/bodkan/slendr) is an R package for describing population genetic models, simulating them with either `msprime` or `SLiM`, and analysing resulting tree sequences using `tskit`. It calls `msprime`, `pyslim`, and `tskit` via `reticulate` R package (that enables calling Python from within R), while it calls `SLiM` via R `system()/shell()` function. It implemented R wrappers for a number of `tskit` functions, such as `ts_read()`, `ts_write()`, `ts_simplify()`, `ts_mutate()`, etc. The wrappers call Python functions (via `reticulate`) and perform additional actions required for integration with R and `slendr` functionality. - - * `slimr` (https://rdinnager.github.io/slimr, https://github.com/rdinnager/slimr) provides an R API for specifying and running SLiM scripts and analysing results in R. It runs `SLiM` via R package `processx`. - - * `tsinfer` (https://tskit.dev/tsinfer/docs, https://github.com/tskit-dev/tsinfer) generates tree sequence from observed genomic (haplotype) data. It has Python API and command line interface. - -As described above, the tree sequence ecosystem is extensive. Python is the most widely used and complete interface for simulation and analysis of tree sequences. - -There is an interest for work with tree sequences in R. Because we can call Python from within R with `reticulate`, there is no pressing need for a dedicated R support for work with tree sequences. See https://tskit.dev/tutorials/tskitr.html on how this looks like and further details at https://rstudio.github.io/reticulate. In a way, this situation will positively focus the community on the Python collection of packages. While there are differences between R and Python, many R users will be able to follow the extensive `tskit` Python API documentation and examples. For example, the `reticulate` approach of working with tree sequences from within R looks like this: - -TODO: Move reticulate code to an external file (since it's quite long)? #28 - https://github.com/HighlanderLab/RcppTskit/issues/28 - -``` -# install.packages("reticulate") -library(reticulate) -reticulate::py_require("msprime") -# msprime and tskit Python API calls from within R via reticulate -msprime <- reticulate::import("msprime") -ts <- msprime$sim_ancestry(10, sequence_length = 100, recombination_rate = 0.1) -is(ts) -# "tskit.trees.TreeSequence" -class(ts) -# [1] "tskit.trees.TreeSequence" "python.builtin.object" -ts$`__class__` -# -# signature: (ll_tree_sequence) -print(ts) -# -cat(py_str(ts)) -# ╔═══════════════════════════╗ -# ║TreeSequence ║ -# ╠═══════════════╤═══════════╣ -# ║Trees │ 73║ -# ... -cat(py_repr(ts)) -# -lobstr::obj_addr(ts) -# 0x177073e40 -py_id(ts) -# [1] "5861550480" -# --> a complex object exist in reticulate Python -# and is wrapped by an R object in R -# https://rstudio.github.io/reticulate/#type-conversions - -print(ts$num_samples) -# [1] 20 -# --> a simple object (like an integer) is converted to an R object -# https://rstudio.github.io/reticulate/#type-conversions - -ts <- msprime$sim_mutations(ts, rate = 0.1) -ts <- ts$simplify(samples = c(0L, 1L, 2L, 3L)) -G <- ts$genotype_matrix() -str(G) -# int [1:54, 1:4] 0 0 0 0 0 ... -# py_id(G) -# Error: ! Expected a python object, received a integer -# --> a simple object (like a NumPy array) is converted to an R object -# https://rstudio.github.io/reticulate/#type-conversions -# https://rstudio.github.io/reticulate/articles/arrays.html - -# G is an R matrix object so we can use R functions on it -allele_frequency <- rowMeans(G) -allele_frequency -``` - -To provide idiomatic R interface to some population genetic simulation steps and operations with tree sequences, `slendr` implemented bespoke functions and wrapper functions that call Python packages and their functions via `reticulate`. As such, `slendr` further lowers barriers for R users. For example, the `slendr/reticulate` approach of working with tree sequences from within R looks like this: - -TODO: Move slendr/reticulate example to an external file (since it's quite long)? #27 - https://github.com/HighlanderLab/RcppTskit/issues/27 - -``` -# install.packages("slendr") -library(slendr) -# setup_env() # run in the first session -init_env() # run in future sessions -mode_file <- system.file("extdata/models/introgression", package = "slendr") -model <- read_model(path = mode_file) -print(model) -# slendr 'model' object -# --------------------- -# populations: CH, AFR, NEA, EUR -... - -afr <- model$populations[["AFR"]] -eur <- model$populations[["EUR"]] -samples <- schedule_sampling(model, times = 0, list(afr, 10), list(eur, 100)) -ts <- msprime(model, sequence_length = 100, recombination_rate = 0.001, - samples = samples) -is(ts) -# [1] "slendr_ts" -class(ts) -# [1] "slendr_ts" "tskit.trees.TreeSequence" "python.builtin.object" -ts$`__class__` -# -# signature: (ll_tree_sequence) -print(ts) -# ╔═══════════════════════════╗ -# ║TreeSequence ║ -# ╠═══════════════╤═══════════╣ -# ║Trees │ 100║ -# ... -cat(reticulate::py_str(ts)) -# ╔═══════════════════════════╗ -# ║TreeSequence ║ -# ╠═══════════════╤═══════════╣ -# ║Trees │ 100║ -# ... -cat(reticulate::py_repr(ts)) -# -lobstr::obj_addr(ts) -# TODO: show R wrapper object address -reticulate::py_id(ts) -# [1] "5894401040" -# --> a complex object lives in reticulate Python -# and is wrapped by an R object in R -# https://rstudio.github.io/reticulate/#type-conversions - -print(ts$num_samples) -# [1] 220 -# --> a simple object (like an integer) is converted to an R object -# https://rstudio.github.io/reticulate/#type-conversions - -ts <- ts_mutate(ts, mutation_rate = 0.0001) -G <- ts$genotype_matrix() -str(G) -# int [1:100, 1:220] 0 0 2 2 0 2 1 0 0 1 ... -# py_id(G) -# Error: ! Expected a python object, received a integer -# --> a simple object (like a NumPy array) is converted to an R object -# https://rstudio.github.io/reticulate/#type-conversions -# https://rstudio.github.io/reticulate/articles/arrays.html - -# G is an R matrix object so we can use R functions on it -allele_frequency <- rowMeans(G) -allele_frequency - -# An alternative from slendr -G <- ts_genotypes(ts) -# 98 multiallelic sites (98.000% out of 100 total) detected and removed -str(G) -# tibble [2 × 221] (S3: tbl_df/tbl/data.frame) -# $ pos : int [1:2] 40 80 -# $ AFR_1_chr1 : int [1:2] 0 0 - -# G is an R tibble object so we can use R functions on it -G <- as.matrix(G[, -1]) -allele_frequency <- rowMeans(G) -allele_frequency -``` - -One downside of using `reticulate` is the overhead of calling Python functions. This overhead is minimal for most analyses - these call a few functions, which do all the work, looping, etc. on Python side, often calling `tskit`'s C API). However, the overhead can be limiting for repeated calling of `tskit` functions, say for tree sequence recording in an R session/package. Note, that while metadata is available in C and Python APIs, it's encoding/decoding is only available in Python API, though `SLiM` does something of it's own in C++! -TODO: Study what `SLiM` does with metadata #24 - https://github.com/HighlanderLab/RcppTskit/issues/24 - -## AIMS for `RcppTskit` - -Given the above state of the tree sequence ecosystem, the aims of the `RcppTskit` package are to provide an easy to install and use R package that enables users to: - 1) Load a tree sequence into R and summarise it, - 2) Pass tree sequence between R and reticulate or standard Python, - 3) Call `tskit` C API in C++ code in R session or script, and - 4) Call `tskit` C API in C++ code in another R package. - -You can see examples for all of these in `vignette(TODO)` and -for 1-3) also in in `?RcppTskit`. - -TODO: Move all these example codes and outputs into vignette - https://github.com/HighlanderLab/RcppTskit/issues/10 - -``` -# Install RcppTskit -remotes::install_github(repo = "HighlanderLab/RcppTskit/RcppTskit") - -# Load RcppTskit and print tskit version -library(RcppTskit) -tskit_version() -# major minor patch -# 1 3 0 -``` - -### 1) Load a tree sequence into R and summarise it - -Here is an example: - -``` -# Load a tree sequence -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) -ts -# -# TODO: Rename ts_load() to ts_load_ptr() and create ts_load() returning -# S3/S4/R6/... object #22 -# https://github.com/HighlanderLab/RcppTskit/issues/22 - -# Print summary of the tree sequence -ts_print(ts) -# $ts -# property value -# 1 num_samples 160 -# 2 sequence_length 10000 -# 3 num_trees 26 -# 4 time_units generations -# 5 has_metadata FALSE -# -# $tables -# table number has_metadata -# 1 provenances 2 NA -# 2 populations 1 TRUE -# 3 migrations 0 FALSE -# ... -``` - -### 2) Pass tree sequence between R and reticulate or standard Python - -Here is an example: - -``` -# Tree sequence in R -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) - -# If you have a tree sequence in R and you want to use tskit Python API, -# you can write it to disk and load it into reticulate Python -ts_py <- ts_r_to_py(ts) -# ... continue in reticulate Python ... -ts_py$num_individuals # 160 -ts2_py = ts_py$simplify(samples = c(0L, 1L, 2L, 3L)) -ts2_py$num_individuals # 2 -# ... and to bring it back to R ... -ts2 <- ts_py_to_r(ts2_py) -ts_num_individuals(ts2) # 2 - -# If you prefer standard (non-reticulate) Python, use this -ts_file <- tempfile() -print(ts_file) -ts_dump(ts, file = ts_file) -# ... continue in standard Python ... -# import tskit -# ts = tskit.load("insert_ts_file_path_here") -# ts.num_individuals # 80 -# ts2 = ts.simplify(samples = [0, 1, 2, 3]) -# ts2.num_individuals # 2 -# ts2.dump("insert_ts_file_path_here") -# ... and to bring it back to R ... -ts2 <- ts_load(ts_file) -ts_num_individuals(ts2) # 2 -``` - -### 3) Call `tskit` C API in C++ code in R session or script - -Here is an example: - -``` -# Load packages -library(RcppTskit) -library(Rcpp) - -# Write and compile a C++ function -codeString <- ' - #include - int ts_num_individuals(SEXP ts) { - int n; - Rcpp::XPtr xptr(ts); - n = (int) tsk_treeseq_get_num_individuals(xptr); - return n; - }' -ts_num_individuals2 <- cppFunction(code = codeString, - depends = "RcppTskit", plugins = "RcppTskit") -# We must specify both the `depends` and `plugins` arguments! - -# Load a tree sequence -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) - -# Apply the compiled function -ts_num_individuals2(ts) -# [1] 80 - -# An identical RcppTskit implementation -ts_num_individuals(ts) -# [1] 80 -``` - -### 4) Call `tskit` C API in C++ code in another R package - -TODO: Move these details to a vignette!? - https://github.com/HighlanderLab/RcppTskit/issues/10 - -Follow the steps below in your R package. To see details of each step, see the files in R package `AlphaSimR` at this [commit](https://github.com/HighlanderLab/AlphaSimR/commit/12657b08e7054d88bc214413d13f36c7cde60d95) (that has a proof of concept of using `tskit` C API via `RcppTskit`). - -a) Open `DESCRIPTION` file and add `RcppTskit` to the `LinkingTo:` field. - -b) Add `#include ` as needed to your C++ header files in `src` directory. - -c) Call `tskit` C API as needed in your C++ code in `src` directory. - -d) Configure your package build to use the `RcppTskit` library file using the following steps: - - * Add `src/Makevars.in` and `src/Makevars.win.in` files with `PKG_LIB = @RCPPTSKIT_LIB@` flag, in addition to other flags. - - * Add `tools/configure.R` file, which will replace `@RCPPTSKIT_LIB@` in `src/Makevars.in` and `src/Makevars.win.in` files with the installed `RcppTskit` library file, including appropriate flags, and generate `src/Makevars` and `src/Makevars.win`. - - * Add `configure` and `configure.win` scripts (and make them executable) to call `tools/configure.R`. - - * Add `cleanup` and `cleanup.win` scripts (and make them executable) to remove `src/Makevars` and `src/Makevars.win` as well as compilation files. - -e) You should now be ready to build, check, and install your package using tools like `devtools::build()`, `devtools::check()`, and `devtools::install()` or their `R CMD` equivalents. - -Here is an example: - -``` -# Install AlphaSimR -# (Commit with a proof of concept of using tskit C API; -# study the file contents in there! Can also use later commits.) -remotes::install_github( - repo = "HighlanderLab/AlphaSimR", - ref = "12657b08e7054d88bc214413d13f36c7cde60d95" -) - -# Load packages -library(RcppTskit) -library(AlphaSimR) - -# Load tree sequence and count the number of individuals -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) -RcppTskit::ts_num_individuals(ts) -AlphaSimR::ts_num_individuals2(ts) -``` diff --git a/RcppTskit/inst/WORDLIST b/RcppTskit/inst/WORDLIST index 3898e4d..c711a40 100644 --- a/RcppTskit/inst/WORDLIST +++ b/RcppTskit/inst/WORDLIST @@ -1,10 +1,12 @@ ARGs ORCID Rcpp +SLiM TreeSequence Tskit etc kastore +msprime num ts tskit diff --git a/RcppTskit/inst/examples/explore_reticulate.R b/RcppTskit/inst/examples/explore_reticulate.R new file mode 100644 index 0000000..76a8e88 --- /dev/null +++ b/RcppTskit/inst/examples/explore_reticulate.R @@ -0,0 +1,50 @@ +# install.packages("reticulate") +library(reticulate) +reticulate::py_require("msprime") +# msprime and tskit Python API calls from within R via reticulate +msprime <- reticulate::import("msprime") +ts <- msprime$sim_ancestry(10, sequence_length = 100, recombination_rate = 0.1) +is(ts) +# "tskit.trees.TreeSequence" +class(ts) +# [1] "tskit.trees.TreeSequence" "python.builtin.object" +ts$`__class__` +# +# signature: (ll_tree_sequence) +print(ts) +# +cat(py_str(ts)) +# ╔═══════════════════════════╗ +# ║TreeSequence ║ +# ╠═══════════════╤═══════════╣ +# ║Trees │ 73║ +# ... +cat(py_repr(ts)) +# +lobstr::obj_addr(ts) +# 0x177073e40 +py_id(ts) +# [1] "5861550480" +# --> a complex object exist in reticulate Python +# and is wrapped by an R object in R +# https://rstudio.github.io/reticulate/#type-conversions + +print(ts$num_samples) +# [1] 20 +# --> a simple object (like an integer) is converted to an R object +# https://rstudio.github.io/reticulate/#type-conversions + +ts <- msprime$sim_mutations(ts, rate = 0.1) +ts <- ts$simplify(samples = c(0L, 1L, 2L, 3L)) +G <- ts$genotype_matrix() +str(G) +# int [1:54, 1:4] 0 0 0 0 0 ... +py_id(G) +# Error: ! Expected a python object, received a integer +# --> a simple object (like a NumPy array) is converted to an R object +# https://rstudio.github.io/reticulate/#type-conversions +# https://rstudio.github.io/reticulate/articles/arrays.html + +# G is an R matrix object so we can use R functions on it +allele_frequency <- rowMeans(G) +allele_frequency diff --git a/RcppTskit/inst/examples/explore_slendr.R b/RcppTskit/inst/examples/explore_slendr.R new file mode 100644 index 0000000..885b040 --- /dev/null +++ b/RcppTskit/inst/examples/explore_slendr.R @@ -0,0 +1,80 @@ +# install.packages("slendr") +library(slendr) +# setup_env() # run in the first session +init_env() # run in future sessions +mode_file <- system.file("extdata/models/introgression", package = "slendr") +model <- read_model(path = mode_file) +print(model) +# slendr 'model' object +# --------------------- +# populations: CH, AFR, NEA, EUR +... + +afr <- model$populations[["AFR"]] +eur <- model$populations[["EUR"]] +samples <- schedule_sampling(model, times = 0, list(afr, 10), list(eur, 100)) +ts <- msprime( + model, + sequence_length = 100, + recombination_rate = 0.001, + samples = samples +) # this will not be instant ... but not too long either +is(ts) +# [1] "slendr_ts" +class(ts) +# [1] "slendr_ts" "tskit.trees.TreeSequence" "python.builtin.object" +ts$`__class__` +# +# signature: (ll_tree_sequence) +print(ts) +# ╔═══════════════════════════╗ +# ║TreeSequence ║ +# ╠═══════════════╤═══════════╣ +# ║Trees │ 100║ +# ... +cat(reticulate::py_str(ts)) +# ╔═══════════════════════════╗ +# ║TreeSequence ║ +# ╠═══════════════╤═══════════╣ +# ║Trees │ 100║ +# ... +cat(reticulate::py_repr(ts)) +# +lobstr::obj_addr(ts) +reticulate::py_id(ts) +# [1] "5894401040" +# --> a complex object lives in reticulate Python +# and is wrapped by an R object in R +# https://rstudio.github.io/reticulate/#type-conversions + +print(ts$num_samples) +# [1] 220 +# --> a simple object (like an integer) is converted to an R object +# https://rstudio.github.io/reticulate/#type-conversions + +ts <- ts_mutate(ts, mutation_rate = 0.0001) +G <- ts$genotype_matrix() +str(G) +# int [1:100, 1:220] 0 0 2 2 0 2 1 0 0 1 ... +reticulate::py_id(G) +# Error: ! Expected a python object, received a integer +# --> a simple object (like a NumPy array) is converted to an R object +# https://rstudio.github.io/reticulate/#type-conversions +# https://rstudio.github.io/reticulate/articles/arrays.html + +# G is an R matrix object so we can use R functions on it +allele_frequency <- rowMeans(G) +allele_frequency + +# An alternative from slendr +G <- ts_genotypes(ts) +# 98 multiallelic sites (98.000% out of 100 total) detected and removed +str(G) +# tibble [2 × 221] (S3: tbl_df/tbl/data.frame) +# $ pos : int [1:2] 40 80 +# $ AFR_1_chr1 : int [1:2] 0 0 + +# G is an R tibble object so we can use R functions on it +G <- as.matrix(G[, -1]) +allele_frequency <- rowMeans(G) +allele_frequency diff --git a/RcppTskit/man/RcppTskit-package.Rd b/RcppTskit/man/RcppTskit-package.Rd index 1df3d00..ece18c5 100644 --- a/RcppTskit/man/RcppTskit-package.Rd +++ b/RcppTskit/man/RcppTskit-package.Rd @@ -13,82 +13,13 @@ Tskit provides Python, C, and Rust APIs. The Python API can be called from R via the `reticulate` R package to seamlessly load and analyse a tree sequence as described at https://tskit.dev/tutorials/tskitr.html. `RcppTskit` provides R access to the `tskit` C API for use cases where the -`reticulate` approach is not optimal. For example, for high-performance -and low-level work with tree sequences. Currently, `RcppTskit` provides a very +`reticulate` option is not optimal. For example, for high-performance +and low-level work with tree sequences. Currently, `RcppTskit` provides a limited number of R functions due to the availability of extensive Python API -and the `reticulate` approach. +and the `reticulate` option. } \examples{ -\dontshow{# Providing the examples here so we test them via R CMD check} -# Here are examples showcasing what you can do with RcppTskit - -# 1) Load a tree sequence into R and summarise it -# Load a tree sequence -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) - -# Print summary of the tree sequence -ts$num_individuals() -ts - -# 2) Pass tree sequence between R and reticulate or standard Python - -# Tree sequence in R -ts_file <- system.file("examples/test.trees", package = "RcppTskit") -ts <- ts_load(ts_file) - -# If you have a tree sequence in R and want to use tskit Python API, use -ts_py <- ts$r_to_py() -# ... continue in reticulate Python ... -ts_py$num_individuals # 80 -ts2_py = ts_py$simplify(samples = c(0L, 1L, 2L, 3L)) -ts2_py$num_individuals # 2 -# ... and to bring it back to R use ... -ts2 <- ts_py_to_r(ts2_py) -ts2$num_individuals() # 2 - -# If you prefer standard (non-reticulate) Python, use -ts_file <- tempfile() -print(ts_file) -ts$dump(file = ts_file) -# ... continue in standard Python ... -# import tskit -# ts = tskit.load("insert_ts_file_path_here") -# ts.num_individuals # 80 -# ts2 = ts.simplify(samples = [0, 1, 2, 3]) -# ts2.num_individuals # 2 -# ts2.dump("insert_ts_file_path_here") -# ... and to bring it back to R use ... -ts2 <- ts_load(ts_file) -ts$num_individuals() # 2 (if you have ran the above Python code) - -# 3) Call tskit C API in C++ code in R session or script -library(Rcpp) -# Write and compile a C++ function -codeString <- ' - #include - int ts_num_individuals(SEXP ts) { - Rcpp::XPtr ts_xptr(ts); - return (int) tsk_treeseq_get_num_individuals(ts_xptr); - }' -ts_num_individuals2 <- Rcpp::cppFunction(code=codeString, - depends="RcppTskit", - plugins="RcppTskit") -# We must specify both the `depends` and `plugins` arguments! - -# Load a tree sequence -ts_file <- system.file("examples/test.trees", package="RcppTskit") -ts <- ts_load(ts_file) - -# Apply the compiled function -ts_num_individuals2(ts$pointer) - -# An identical RcppTskit implementation -ts$num_individuals() - -# 4) Call `tskit` C API in C++ code in another R package -# TODO: Show vignette here -# https://github.com/HighlanderLab/RcppTskit/issues/10 +vignette(package="RcppTskit") } \seealso{ Useful links: diff --git a/RcppTskit/notes_pkg_dev.Rmd b/RcppTskit/notes_pkg_dev.Rmd index 4c3ef36..cc73e1b 100644 --- a/RcppTskit/notes_pkg_dev.Rmd +++ b/RcppTskit/notes_pkg_dev.Rmd @@ -2,8 +2,20 @@ ## Next TODOs -* TODO: Add R package badges (build status, CRAN version, etc.) to README.md #1 - https://github.com/HighlanderLab/RcppTskit/issues/1 +# Release (TODO) +# TODO: Tag a release #15 +# https://github.com/HighlanderLab/RcppTskit/issues/15 +# remotes::install_github("HighlanderLab/RcppTskit/RcppTskit") + +# Main branch +remotes::install_github("HighlanderLab/RcppTskit/RcppTskit") + +# TODO: Create a devel branch #16 +# https://github.com/HighlanderLab/RcppTskit/issues/16 +# remotes::install_github("HighlanderLab/RcppTskit/RcppTskit@devel") + +# TODO: Publish on CRAN #14 +# https://github.com/HighlanderLab/RcppTskit/issues/14 * This PR nicely shows how slendr is dealing with ts objects (it saves various ts information as attributes) diff --git a/RcppTskit/tests/testthat.R b/RcppTskit/tests/testthat.R index 05dcc2d..d2532cf 100644 --- a/RcppTskit/tests/testthat.R +++ b/RcppTskit/tests/testthat.R @@ -1,3 +1,11 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + library(testthat) library(RcppTskit) diff --git a/RcppTskit/vignettes/RcppTskit_intro.qmd b/RcppTskit/vignettes/RcppTskit_intro.qmd new file mode 100644 index 0000000..54df229 --- /dev/null +++ b/RcppTskit/vignettes/RcppTskit_intro.qmd @@ -0,0 +1,351 @@ +--- +title: "RcppTskit: Working with tree sequences in R" +author: "Gregor Gorjanc" +date: today +bibliography: references.bib +vignette: > + %\VignetteIndexEntry{RcppTskit: Working with tree sequences in R} + %\VignetteEngine{quarto::html} + %\VignetteEncoding{UTF-8} +knitr: + opts_chunk: + collapse: true + comment: '#>' +--- + +## Introduction + +The aim of this vignette is to introduce you to +working with tree sequences in R using `RcppTskit` package, +which provides R access to the `tskit` C API +[@jeffrey2026population] (https://tskit.dev/tskit/docs/stable/c-api.html). +If you are new to tree sequences and +the more general concept of ancestral recombination graphs (ARGs), +see @brandt2024promise, @lewanski2024era, @nielsen2024inference, and @wong2024general. +Before diving into how we can use `RcppTskit`, +we summarise the state of the tree sequence ecosystem. +Namely, there is now a large ecosystem of tools to work with tree sequence in various ways, +and the existence of these tools has been shaped the design of `RcppTskit`. +Then we state the aims for `RcppTskit`, +describe the implemented data and class model, and +show how we can use `RcppTskit` in four typical cases. + +As summarised below, +Python is the most widely used environment to work with tree sequences. +We are of the view that most R users can and should leverage +this large ecosystem of Python packages, in particular the `tskit` Python API +[@jeffrey2026population] (https://tskit.dev/tskit/docs/stable/python-api.html), +via the R package `reticulate` [@ushey2025reticulate] (https://rstudio.github.io/reticulate). +With this in mind, +`RcppTskit` provides R access to the `tskit` C API [@jeffrey2026population] +for use cases where the `reticulate` option is not optimal. +For example, for high-performance and low-level work with tree sequences. +As such, `RcppTskit` currently provides a limited number of R functions +due to the availability of extensive Python API and the `reticulate` option. +As the name suggest, `RcppTskit` leverages the R package `Rcpp` +[@eddelbuettel2026rcpp] (https://www.rcpp.org), +which significantly lowers the barrier of using C++ in R. +However, we still have to write wrapper C++ functions and expose them to R, +hence our recommendation to use `reticulate` in the first instance. + +## State of the tree sequence ecosystem + +The tree sequence ecosystem is rapidly evolving. +The website https://tskit.dev/software lists tools that closely interoperate with `tskit`, +while @jeffrey2026population lists several other tools that depend on `tskit` functionality. +Thence, there are now multiple tools for the generation and analysis of tree sequences. +Below is a quick summary of some of the tools relevant to `RcppTskit` as of January 2026. + +- `tskit` (https://tskit.dev/tskit/docs, https://github.com/tskit-dev/tskit) + is the core toolkit for working with tree sequences. + It has an efficient C API and user-friendly Python API. + The python API is a popular entry point for most users and + expands the C API in certain aspects (for example metadata encoding/decoding). + There is also a Rust API that wraps the C API. + +- `msprime` (https://tskit.dev/msprime/docs, https://github.com/tskit-dev/msprime) + generates tree sequences with backward in time simulation. + It has a Python API and command line interface. + +- `SLiM` (https://messerlab.org/slim, https://github.com/MesserLab/SLiM) + generates tree sequences with forward in time simulation. + It is written in C++ (with embedded `tskit` C library) and + has a command line and a GUI interface. + Its tree sequence recording is described in detail at + https://github.com/MesserLab/SLiM/blob/master/treerec/implementation.md. + +- `pyslim` (https://tskit.dev/pyslim/docs, https://github.com/tskit-dev/pyslim) + provides a Python API for reading and modifying `tskit` tree sequence files from SLiM, + or modifying files produced by other programs (e.g., msprime) for use in SLiM. + +- `fwdpy11` (https://molpopgen.github.io/fwdpy11, https://github.com/molpopgen/fwdpy11) + generates tree sequences with forward in time simulation. + It has a Python API, which is built on a C++ API (`fwdpp`). + +- `stdpopsim` (https://popsim-consortium.github.io/stdpopsim-docs, https://github.com/popsim-consortium/stdpopsim) + is a standard library of population genetic models used in simulations with `msprime` and `SLiM`. + It has a Python API and command line interface. + +- `slendr` (https://bodkan.net/slendr, https://github.com/bodkan/slendr) + is an R package for describing population genetic models, + simulating them with either `msprime` or `SLiM`, and + analysing resulting tree sequences using `tskit`. + +- `slimr` (https://rdinnager.github.io/slimr, https://github.com/rdinnager/slimr) + provides an R API for specifying and running SLiM scripts and analysing results in R. + It runs `SLiM` via the R package `processx`. + +The above tools enable work with tree sequences and/or generate them via simulation. +There is a growing list of tools that generate (estimate/infer) ARGs from observed genomic data +and can export it in the tree sequence file format. +Notable mentions are: +`tsinfer` (https://tskit.dev/tsinfer/docs, https://github.com/tskit-dev/tsinfer), +`Relate` (https://myersgroup.github.io/relate, https://github.com/MyersGroup/relate), +`SINGER` (https://github.com/popgenmethods/SINGER), +`ARGNeedle` (https://palamaralab.github.io/software/argneedle, https://github.com/PalamaraLab/arg-needle-lib), and +`Threads` (https://palamaralab.github.io/software/threads, https://github.com/palamaraLab/threads). + +As described above, the tree sequence ecosystem is extensive. +Python is the most widely used platform to interact with tree sequences +with comprehensive packages for simulation and analysis. + +There is interest in working with tree sequences in R. +Because we can call Python from within R using the `reticulate` R package, +there is no pressing need for a dedicated R support for work with tree sequences. +See https://tskit.dev/tutorials/tskitr.html on how this option looks like. +In a way, this situation will positively focus the community on the Python collection of packages. +While there are differences between Python and R, +many R users should be able to follow +the extensive Python API documentation, examples, and tutorials listed above, +in particular those at https://tskit.dev/tutorials. + +To provide idiomatic R interface to some population genetic simulation steps and operations with tree sequences, +`slendr` implemented bespoke functions and wrapper functions to interact with `msprime`, `SLiM`, and `tskit`. +It uses `reticulate` for the interaction with Python APIs of these packages. +As such, `slendr` further lowers barriers for R users to work with tree sequences. + +One downside of using `reticulate` is the overhead of calling Python functions. +This overhead is minimal for most analyses because a user would call a few Python functions, +which do all the work, looping, etc. on the Python side, +often calling the `tskit`'s C API. +However, the overhead can be limiting for repeated calling of `tskit` functions, +say for tree sequence recording in an R session or package. + +## Aims for `RcppTskit` + +Given the above state of the tree sequence ecosystem, +the aims of the `RcppTskit` package are to provide an easy to install and use R package +that supports users in four typical cases. +The authors are open to expanding the scope of `RcppTskit` depending on the demand +and engagement of the users. +The four typical cases are: + +1. Load a tree sequence into R and summarise it, + +2. Pass a tree sequence between R and reticulate or standard Python, + +3. Call `tskit` C API in C++ code in an R session or script, and + +4. Call `tskit` C API in C++ code in another R package. + +Examples for all of these cases are provided in the below sections +after we describe the implemented data and class model. + +## Data and class model + +`RcppTskit` represents a tree sequence as a lightweight R object of R6 class `TreeSequence`. +R6 class was partially chosen so the R code calls resemble Python code. +`TreeSequence` wraps an external pointer (`externalptr`) to the `tskit` C data structure (`tsk_treeseq_t`). +Most methods (for example, `ts$num_individuals()`, `ts$dump()`, etc.) call the `tskit` C API via `Rcpp`, +so the calls are fast and the object is not copied unless you explicitly write/read or change it. +The underlying pointer is exposed as `TreeSequence$pointer` for developers and advanced users +that can write C++ code. + +## For typical use cases + +First install `RcppTskit` from CRAN and load it. + +```{r} +#| label: pre-setup +#| include: false +# Had issues on Windows (reporting that RcppTskit is not installed, which is odd). +# This tries to get library path "back". +lib_env <- Sys.getenv("R_LIBS") +if (nzchar(lib_env)) { + .libPaths(unique(c(strsplit(lib_env, .Platform$path.sep)[[1]], .libPaths()))) +} +``` + +```{r} +#| label: setup +# install.packages("RcppTskit") + +test <- require(RcppTskit) +if (!test) { + message("RcppTskit not available; skipping vignette execution.") + knitr::opts_chunk$set(eval = FALSE) +} +``` + +### 1) Load a tree sequence into R and summarise it + +```{r} +#| label: use_case_1 +# Load a tree sequence +ts_file <- system.file("examples/test.trees", package = "RcppTskit") +ts <- ts_load(ts_file) +is(ts) + +# Print the summary of the tree sequence +ts$print() +# ts # the same as above + +ts$num_individuals() + +# Explore the help pages +help(package = "RcppTskit") +``` + +### 2) Pass tree sequence between R and reticulate or standard Python + +```{r} +#| label: use_case_2 +# Tree sequence in R +ts_file <- system.file("examples/test.trees", package = "RcppTskit") +ts <- ts_load(ts_file) + +# If you now want to use tskit Python API in reticulate Python, use +ts_py <- ts$r_to_py() +# ... continue in reticulate Python ... +ts_py$num_individuals # 80 +ts2_py = ts_py$simplify(samples = c(0L, 1L, 2L, 3L)) +ts2_py$num_individuals # 2 +# ... and to bring it back to R, use ... +ts2 <- ts_py_to_r(ts2_py) +ts2$num_individuals() # 2 + +# If you prefer standard (non-reticulate) Python, use +ts_file <- tempfile() +print(ts_file) +ts$dump(file = ts_file) +# ... continue in standard Python ... +# import tskit +# ts = tskit.load("insert_ts_file_path_here") +# ts.num_individuals # 80 +# ts2 = ts.simplify(samples = [0, 1, 2, 3]) +# ts2.num_individuals # 2 +# ts2.dump("insert_ts_file_path_here") +# ... and to bring it back to R, use ... +ts2 <- ts_load(ts_file) +ts$num_individuals() # 2 (if you have ran the above Python code) +``` + +### 3) Call `tskit` C API in C++ code in an R session or script + +```{r} +#| label: use_case_3 +# Write a C++ function as multi-line character string +codeString <- ' + #include + int ts_num_individuals(SEXP ts) { + Rcpp::XPtr ts_xptr(ts); + return (int) tsk_treeseq_get_num_individuals(ts_xptr); + }' + +# Compile the C++ function +ts_num_individuals2 <- Rcpp::cppFunction( + code = codeString, + depends = "RcppTskit", + plugins = "RcppTskit" +) +# We must specify both the `depends` and `plugins` arguments! + +# Load a tree sequence +ts_file <- system.file("examples/test.trees", package = "RcppTskit") +ts <- ts_load(ts_file) + +# Apply the compiled function (on the pointer) +ts_num_individuals2(ts$pointer) + +# An identical RcppTskit implementation (available as the method of the TreeSequence class) +ts$num_individuals() +``` + +### 4) Call `tskit` C API in C++ code in another R package + +To call the `tskit` C API in your own R package via `Rcpp` you can leverage `RcppTskit`. +Just follow the steps below. +To see details of each step, see the files in the R package `AlphaSimR` +at this commit (proof of concept of using `tskit` C API via `tskitr`[^1]): +https://github.com/HighlanderLab/AlphaSimR/commit/12657b08e7054d88bc214413d13f36c7cde60d95 +(with time this implementation might require changes). + +[^1]: At that time we named the package as `tskitr`. Simply replace `tskitr` with `RcppTskit`, +respecting the lower and upper case depending on the file. + +a) Open `DESCRIPTION` file and add `RcppTskit` to the `LinkingTo:` field. + +b) Add `#include ` as needed to your C++ header files in `src` directory. + +c) Call `tskit` C API as needed in your C++ code in `src` directory (see examples in `RcppTskit`). + +d) Configure your package build to use the `RcppTskit` library file using the following steps: + + - Add `src/Makevars.in` and `src/Makevars.win.in` files with `PKG_LIB = @RCPPTSKIT_LIB@` flag, + in addition to other flags. + + - Add `tools/configure.R` file, + which will replace `@RCPPTSKIT_LIB@` in `src/Makevars.in` and `src/Makevars.win.in` files + with the installed `RcppTskit` library file (including appropriate flags), and + generate `src/Makevars` and `src/Makevars.win`. + + - Add `configure` and `configure.win` scripts (and make them executable) + to call `tools/configure.R`. + + - Add `cleanup` and `cleanup.win` scripts (and make them executable) + to remove `src/Makevars` and `src/Makevars.win` as well as compilation files. + +e) You should now be ready to build, check, and install your package using + `devtools::build()`, `devtools::check()`, and `devtools::install()` or their `R CMD` equivalents. + +```{r} +#| label: use_case_4 +#| eval: false +#| echo: false + +# This code will not work atm because we used tskitr! +# TODO: Create a minimal package to demonstrate how to link against RcppTskit and call tskit C API + +# Install AlphaSimR +# (Commit with a proof of concept of using tskit C API; +# study the file contents in there! Can also use later commits.) +remotes::install_github( + repo = "HighlanderLab/AlphaSimR", + ref = "12657b08e7054d88bc214413d13f36c7cde60d95" +) + +# Load packages +library(RcppTskit) +library(AlphaSimR) + +# Load tree sequence and count the number of individuals +ts_file <- system.file("examples/test.trees", package = "RcppTskit") +ts <- ts_load(ts_file) +RcppTskit::ts_num_individuals(ts) +AlphaSimR::ts_num_individuals2(ts) +``` + +## Conclusion + +`RcppTskit` provides R access to `tskit` C API with simple installation and downstream use. +It provides a limited number of R functions because most users can and should use +the `reticulate` option to call `tskit` Python API. +When this option is not optimal, developers and advanced users can call +the `tskit` C API` via `Rcpp`. + +## Session information + +```{r} +#| label: session-info +sessionInfo() +``` diff --git a/RcppTskit/vignettes/references.bib b/RcppTskit/vignettes/references.bib new file mode 100644 index 0000000..f39c3e2 --- /dev/null +++ b/RcppTskit/vignettes/references.bib @@ -0,0 +1,142 @@ + +@article{brandt2024promise, + title = {The Promise of Inferring the Past using the {Ancestral Recombination Graph (ARG)}}, + author = {Brandt, D{\'e}bora Y C and + Huber, Christian D and + Chiang, Charleston W K and + Ortega-Del Vecchyo, Diego}, + journal = {Genome Biology and Evolution}, + year = {2024}, + pages = {evae005}, + doi = {10.1093/gbe/evae005} +} + +@manual{eddelbuettel2026rcpp, + title = {Rcpp: Seamless R and C++ Integration}, + author = {Eddelbuettel, Dirk and + Francois, Romain and + Allaire, JJ and + Ushey, Kevin and + Kou, Qiang and + Russell, Nathan and + Ucar, Iñaki and + Bates, Doug and + Chambers, John}, + year = {2026}, + note = {R package version 1.1.1}, + doi = {10.32614/CRAN.package.Rcpp}, + url = {https://CRAN.R-project.org/package=Rcpp} +} + +@article{jeffrey2026population, + title = {Population-scale Ancestral Recombination Graphs with tskit 1.0}, + author = {Jeffery, Ben and + Wong, Yan and + Thornton, Kevin and + Tsambos, Georgia and + Bisschop, Gertjan and + Deng, Yun and + Ellerman, E Castedo and + Forest, Thomas and + Fritze, Halley and + Goldstein, Daniel and + Gorjanc, Gregor and + Gower, Graham and + Gravel, Simon and + Guez, Jeremy and + Haller, Benjamin C. and + Kern, Andrew D. and + Kirk, Lloyd and + Krukov, Ivan and + Lee, Hanbin and + Lehmann, Brieuc and + Loay, Hossameldin and + Osmond, Matthew M and + Palmer, Duncan S and + Pope, Nathaniel S and + Ragsdale, Aaron and + Robertson, Duncan and + Rodrigues, Murillo F and + Weiss, Clemens and + Wohns, Anthony Wilder and + Zhan, Shing H and + Zhang, Brian and + van Kemenade, Hugo and + Aspbury, Maz and + Baya, Nik and + Belsare, Saurabh and + Biddanda, Arjun and + Campuzano, Curro and + Gladstein, Ariella and + Guo, Bing and + Karthikeyan, Savita and + Kretzschumar, Warren W and + Rebollo, Inés and + Saunack, Kumar and + Shemirani, Ruhollah and + Simon, Alexis and + Smith, Chris and + Sukumaran, Jeet and + Terhorst, Jonathan and + Unneberg, Per and + Zhang, Ao and + Ralph, Peter + Kelleher, Jerome}, + journal = {TODO}, + year = {2026} +} + +@article{lewanski2024era, + title = {The era of the {ARG}: An introduction to ancestral recombination graphs and their significance in empirical evolutionary genomics}, + author = {Lewanski, Alexander L. and + Grundler, Michael C. and + Bradburd, Gideon S.}, + journal = {PLOS Genetics}, + year = {2024}, + volume = {20}, + number = {1}, + pages = {1--24}, + doi = {10.1371/journal.pgen.1011110}, + url = {https://doi.org/10.1371/journal.pgen.1011110} +} + +@article{nielsen2024inference, + title = {Inference and applications of ancestral recombination graphs}, + author = {Nielsen, Rasmus and + Vaughn, Andrew H. and + Deng, Yun}, + journal = {Nature Reviews Genetics}, + year = {2024}, + volume = {26}, + pages = {47--58}, + doi = {10.1038/s41576-024-00772-4}, + url = {https://doi.org/10.1038/s41576-024-00772-4} +} + +@manual{ushey2025reticulate, + title = {reticulate: Interface to {Python}}, + author = {Ushey, Kevin and + Allaire, JJ and + Tang, Yuan}, + year = {2025}, + note = {R package version 1.44.1}, + doi = {10.32614/CRAN.package.reticulate}, + url = {https://CRAN.R-project.org/package=reticulate} +} + +@article{wong2024general, + title = {A general and efficient representation of ancestral recombination graphs}, + author = {Wong, Yan and + Ignatieva, Anastasia and + Koskela, Jere and + Gorjanc, Gregor and + Wohns, Anthony W and + Kelleher, Jerome}, + journal = {Genetics}, + volume = {228}, + number = {1}, + pages = {iyae100}, + year = {2024}, + month = {07}, + doi = {10.1093/genetics/iyae100} +}