[med-svn] [r-cran-sourcetools] 03/05: New upstream version 0.1.5

Andreas Tille tille at debian.org
Thu Oct 12 16:46:20 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository r-cran-sourcetools.

commit c56394c72ca6d9a85796110c6092cce62a7f8120
Author: Andreas Tille <tille at debian.org>
Date:   Thu Oct 12 18:43:52 2017 +0200

    New upstream version 0.1.5
---
 DESCRIPTION                                        |  21 +
 LICENSE                                            |   2 +
 MD5                                                |  48 ++
 NAMESPACE                                          |  11 +
 NEWS.md                                            |  36 ++
 R/sourcetools.R                                    |  98 ++++
 R/util.R                                           |  57 +++
 README.md                                          |  81 ++++
 debian/README.test                                 |   9 -
 debian/changelog                                   |   5 -
 debian/compat                                      |   1 -
 debian/control                                     |  25 -
 debian/copyright                                   |  33 --
 debian/docs                                        |   3 -
 debian/rules                                       |   5 -
 debian/source/format                               |   1 -
 debian/tests/control                               |   3 -
 debian/tests/run-unit-test                         |  13 -
 debian/watch                                       |   2 -
 inst/include/sourcetools.h                         |  12 +
 inst/include/sourcetools/collection/Position.h     |  77 +++
 inst/include/sourcetools/collection/Range.h        |  34 ++
 inst/include/sourcetools/collection/collection.h   |   7 +
 inst/include/sourcetools/core/core.h               |   7 +
 inst/include/sourcetools/core/macros.h             |  72 +++
 inst/include/sourcetools/core/util.h               | 142 ++++++
 inst/include/sourcetools/cursor/TextCursor.h       |  66 +++
 inst/include/sourcetools/cursor/TokenCursor.h      | 321 +++++++++++++
 inst/include/sourcetools/cursor/cursor.h           |   7 +
 inst/include/sourcetools/multibyte/multibyte.h     |  41 ++
 inst/include/sourcetools/platform/platform.h       |  20 +
 inst/include/sourcetools/r/RCallRecurser.h         |  75 +++
 inst/include/sourcetools/r/RConverter.h            |  39 ++
 inst/include/sourcetools/r/RFunctions.h            |  85 ++++
 inst/include/sourcetools/r/RHeaders.h              |   8 +
 .../include/sourcetools/r/RNonStandardEvaluation.h | 149 ++++++
 inst/include/sourcetools/r/RUtils.h                | 100 ++++
 inst/include/sourcetools/r/r.h                     |  11 +
 inst/include/sourcetools/read/MemoryMappedReader.h | 139 ++++++
 .../sourcetools/read/posix/FileConnection.h        |  58 +++
 .../read/posix/MemoryMappedConnection.h            |  55 +++
 inst/include/sourcetools/read/read.h               |  24 +
 .../sourcetools/read/windows/FileConnection.h      |  50 ++
 .../read/windows/MemoryMappedConnection.h          |  51 ++
 inst/include/sourcetools/tests/testthat.h          |  14 +
 .../sourcetools/tokenization/Registration.h        | 190 ++++++++
 inst/include/sourcetools/tokenization/Token.h      | 522 +++++++++++++++++++++
 inst/include/sourcetools/tokenization/Tokenizer.h  | 463 ++++++++++++++++++
 .../sourcetools/tokenization/tokenization.h        |   8 +
 inst/include/sourcetools/utf8/utf8.h               | 115 +++++
 man/read.Rd                                        |  25 +
 man/tokenize-methods.Rd                            |  42 ++
 src/Makevars                                       |   1 +
 src/Makevars.win                                   |   1 +
 src/Reader.cpp                                     |  88 ++++
 src/Tokenizer.cpp                                  |  96 ++++
 tests/testthat.R                                   |   4 +
 tests/testthat/helper-utf8.R                       |   3 +
 tests/testthat/test-read.R                         |  30 ++
 tests/testthat/test-tokenize.R                     | 165 +++++++
 60 files changed, 3771 insertions(+), 100 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..24bd541
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: sourcetools
+Type: Package
+Title: Tools for Reading, Tokenizing and Parsing R Code
+Version: 0.1.5
+Author: Kevin Ushey
+Maintainer: Kevin Ushey <kevinushey at gmail.com>
+Description: Tools for the reading and tokenization of R code. The
+    'sourcetools' package provides both an R and C++ interface for the tokenization
+    of R code, and helpers for interacting with the tokenized representation of R
+    code.
+License: MIT + file LICENSE
+LazyData: TRUE
+Depends: R (>= 3.0.2)
+Suggests: testthat
+RoxygenNote: 5.0.1
+BugReports: https://github.com/kevinushey/sourcetools/issues
+Encoding: UTF-8
+NeedsCompilation: yes
+Packaged: 2016-09-14 22:38:37 UTC; kevin
+Repository: CRAN
+Date/Publication: 2016-09-15 03:07:07
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a7bc902
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2015-2016
+COPYRIGHT HOLDER: Kevin Ushey
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..56e91d2
--- /dev/null
+++ b/MD5
@@ -0,0 +1,48 @@
+f46bb7c8e02f465638a7a3f70bcbb76e *DESCRIPTION
+472904db5a93a07692b3fe24cabcf96c *LICENSE
+d904b6ab89c989c9148d4efba103d02d *NAMESPACE
+f236d8fcec934db9ad39b317d5308bd7 *NEWS.md
+1782b737100f74e11e90c8c45db0d509 *R/sourcetools.R
+b09019840734b467d388e34905ebc46c *R/util.R
+d82a27087d6f3fac9d06978a31640aed *README.md
+c5215eb66349006d73ad7e65ce54046b *inst/include/sourcetools.h
+f071c0148a4629ea8150b63d2627cf0c *inst/include/sourcetools/collection/Position.h
+019b4182b5d6b9700562f8d3b90ac1f4 *inst/include/sourcetools/collection/Range.h
+d11b4138653828a197304b1cb692c614 *inst/include/sourcetools/collection/collection.h
+d77f935ab3b7da52405a501f2404d18e *inst/include/sourcetools/core/core.h
+9ad041bb1ce4251ab7afccee1ac6de3d *inst/include/sourcetools/core/macros.h
+78bfe50df612b63dab8688d7bd8c8bbd *inst/include/sourcetools/core/util.h
+353bc7e4e2bf9b62e301243a8a631e52 *inst/include/sourcetools/cursor/TextCursor.h
+81d685fb305ce649d7d159d003b4b1b3 *inst/include/sourcetools/cursor/TokenCursor.h
+42d2f27e9ae85211cd5542eaecc7c37b *inst/include/sourcetools/cursor/cursor.h
+973781254e2b3ae94eb0770554efae33 *inst/include/sourcetools/multibyte/multibyte.h
+63818672820bc3a620d09e67c68af7c2 *inst/include/sourcetools/platform/platform.h
+f10cddd374f1c671456a4120496a4291 *inst/include/sourcetools/r/RCallRecurser.h
+78a60ba9c51951eb8b4211a0bd8bc998 *inst/include/sourcetools/r/RConverter.h
+5d890077972c2a4eb07c98aafc63690c *inst/include/sourcetools/r/RFunctions.h
+eab3a99f83f6bfa7b7cd0b7e8072edaa *inst/include/sourcetools/r/RHeaders.h
+dc5e82ab54673bd892e6b420896f101b *inst/include/sourcetools/r/RNonStandardEvaluation.h
+1e2951a40e7692881c7ab4645796a9ee *inst/include/sourcetools/r/RUtils.h
+da17972b93e9b4e91554f705b4cda985 *inst/include/sourcetools/r/r.h
+8ea2ac860e800d20143be7792483ba7c *inst/include/sourcetools/read/MemoryMappedReader.h
+8096eb102d9679287cc8fa705d3a21da *inst/include/sourcetools/read/posix/FileConnection.h
+e0147869348d196193c13e89762670f0 *inst/include/sourcetools/read/posix/MemoryMappedConnection.h
+44abbae26e9ab704c92a9ce6c98ba33a *inst/include/sourcetools/read/read.h
+e77d3eb6a47db7e6d1e65f59eb3ab2c5 *inst/include/sourcetools/read/windows/FileConnection.h
+f4b06a29aca570063567f8a765609056 *inst/include/sourcetools/read/windows/MemoryMappedConnection.h
+b80180f53ca809b9ba6d4e7df6316e0b *inst/include/sourcetools/tests/testthat.h
+c6afafc697b747111348dabb88d9fb4a *inst/include/sourcetools/tokenization/Registration.h
+9ab2cf85d30171f4744d21d10c6cd532 *inst/include/sourcetools/tokenization/Token.h
+6599c5daf2f3e59861153982cc00efef *inst/include/sourcetools/tokenization/Tokenizer.h
+00c91c6e20bc534fa3a689c79770f46c *inst/include/sourcetools/tokenization/tokenization.h
+1491ededa24449d40554757c96bebaf0 *inst/include/sourcetools/utf8/utf8.h
+3005e918c6f7dbf54993a04b74ca9e54 *man/read.Rd
+a94108446e930c7c488c695e1618f049 *man/tokenize-methods.Rd
+3f03da795dd26373156bddc78d41e95d *src/Makevars
+3f03da795dd26373156bddc78d41e95d *src/Makevars.win
+43927b22e2812e5ba35419390a7933ae *src/Reader.cpp
+78946eee022f534743af918d651dd244 *src/Tokenizer.cpp
+175dc27564828d1abeb87bc004d02266 *tests/testthat.R
+b6ba9001993894a2085c981a6c58018d *tests/testthat/helper-utf8.R
+0e31fb15ea8b66d310162f60c434ed7d *tests/testthat/test-read.R
+23eb599dfa50f8915e1448b88e1719a4 *tests/testthat/test-tokenize.R
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..8a6b65c
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,11 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,RTokens)
+export(read)
+export(read_bytes)
+export(read_lines)
+export(read_lines_bytes)
+export(tokenize)
+export(tokenize_file)
+export(tokenize_string)
+useDynLib(sourcetools)
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..c936e21
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,36 @@
+# sourcetools 0.1.5
+
+- Ensure that symbols included from e.g. `<cstdio>`, `<cstring>`
+  are resolved using a `std::` prefix.
+# sourcetools 0.1.4
+
+- More work to ensure `sourcetools` can build on Solaris.
+
+# sourcetools 0.1.3
+
+- Relax C++11 requirement, to ensure that `sourcetools` can
+  build on machines with older compilers (e.g. gcc 4.4).
+  
+# sourcetools 0.1.2
+
+- Disable failing tests on Solaris.
+
+# sourcetools 0.1.1
+
+- Rename token type `ERR` to `INVALID` to fix build errors
+  on Solaris.
+
+# sourcetools 0.1.0
+
+## Features
+
+The first release of `sourcetools` comes with a small set
+of features exposed to R:
+
+- `read(file)`: Read a file (as a string). Similar to
+  `readChar()`, but faster (and may be optimized to
+  use a memory-mapped file reader in the future).
+
+- `tokenize_file(file)`: Tokenize an R script.
+
+- `tokenize_string(string)`: Tokenize a string of R code.
diff --git a/R/sourcetools.R b/R/sourcetools.R
new file mode 100644
index 0000000..24c5512
--- /dev/null
+++ b/R/sourcetools.R
@@ -0,0 +1,98 @@
+#' @useDynLib sourcetools
+NULL
+
+#' Read the Contents of a File
+#'
+#' Read the contents of a file into a string (or, in the case of
+#' \code{read_lines}, a vector of strings).
+#'
+#' @param path A file path.
+#'
+#' @name read
+#' @rdname read
+#' @export
+read <- function(path) {
+  path <- normalizePath(path, mustWork = TRUE)
+  .Call("sourcetools_read", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_lines <- function(path) {
+  path <- normalizePath(path, mustWork = TRUE)
+  .Call("sourcetools_read_lines", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_bytes <- function(path) {
+  path <- normalizePath(path, mustWork = TRUE)
+  .Call("sourcetools_read_bytes", path, PACKAGE = "sourcetools")
+}
+
+#' @name read
+#' @rdname read
+#' @export
+read_lines_bytes <- function(path) {
+  path <- normalizePath(path, mustWork = TRUE)
+  .Call("sourcetools_read_lines_bytes", path, PACKAGE = "sourcetools")
+}
+
+#' Tokenize R Code
+#'
+#' Tools for tokenizing \R code.
+#'
+#' @param file,path A file path.
+#' @param text,string \R code as a character vector of length one.
+#'
+#' @note Line numbers are determined by the presence of the \code{\\n}
+#' line feed character, under the assumption that code being tokenized
+#' will use either \code{\\n} to indicate newlines (as on modern
+#' Unix systems), or \code{\\r\\n} as on Windows.
+#'
+#' @return A \code{data.frame} with the following columns:
+#'
+#' \tabular{ll}{
+#' \code{value}  \tab The token's contents, as a string.     \cr
+#' \code{row}    \tab The row where the token is located.    \cr
+#' \code{column} \tab The column where the token is located. \cr
+#' \code{type}   \tab The token type, as a string.           \cr
+#' }
+#'
+#' @rdname tokenize-methods
+#' @export
+#' @examples
+#' tokenize_string("x <- 1 + 2")
+tokenize_file <- function(path) {
+  path <- normalizePath(path, mustWork = TRUE)
+  .Call("sourcetools_tokenize_file", path, PACKAGE = "sourcetools")
+}
+
+#' @rdname tokenize-methods
+#' @export
+tokenize_string <- function(string) {
+  .Call("sourcetools_tokenize_string", as.character(string), PACKAGE = "sourcetools")
+}
+
+#' @rdname tokenize-methods
+#' @export
+tokenize <- function(file = "", text = NULL) {
+  if (is.null(text))
+    text <- read(file)
+  tokenize_string(text)
+}
+
+#' @export
+print.RTokens <- function(x, ...) {
+  print.data.frame(x, ...)
+}
+
+parse_string <- function(string) {
+  .Call("sourcetools_parse_string", string, PACKAGE = "sourcetools")
+}
+
+parse_file <- function(file) {
+  parse_string(read(file))
+}
diff --git a/R/util.R b/R/util.R
new file mode 100644
index 0000000..f5536b7
--- /dev/null
+++ b/R/util.R
@@ -0,0 +1,57 @@
+.sourcetools <- new.env(parent = emptyenv())
+.sourcetools$gctorture <- TRUE
+
+with_gctorture <- function(expr) {
+  gctorture(.sourcetools$gctorture)
+  result <- expr
+  gctorture(FALSE)
+  result
+}
+
+check_parse <- function(R, S = R) {
+  lhs <- base::parse(text = R, keep.source = FALSE)
+  rhs <- with_gctorture(parse_string(S))
+  check_parse_impl(lhs, rhs)
+}
+
+check_parse_impl <- function(lhs, rhs) {
+
+  lhsType <- typeof(lhs)
+  rhsType <- typeof(rhs)
+
+  onError <- function(format, ...) {
+    message <- c(
+      sprintf(format, ...),
+      sprintf("R: '%s'", deparse(lhs)),
+      sprintf("S: '%s'", deparse(rhs))
+    )
+    stop(paste(message, collapse = "\n"), call. = FALSE)
+  }
+
+  if (lhsType != rhsType)
+    onError("TypeError: '%s' != '%s'", lhsType, rhsType)
+
+  if (length(lhs) != length(rhs))
+    onError("LengthError: %s != %s", length(lhs), length(rhs))
+
+  if (is.call(lhs) || is.expression(lhs)) {
+    lapply(seq_along(lhs), function(i) {
+      check_parse_impl(lhs[[i]], rhs[[i]])
+    })
+  }
+
+  if (!identical(lhs, rhs))
+    onError("IdenticalError: '%s' != '%s'", lhs, rhs)
+
+  TRUE
+}
+
+expect_parse <- function(R, S = R) {
+  testthat::expect_true(check_parse(R, S))
+}
+
+search_objects <- function() {
+  lapply(seq_along(search()), function(i) {
+    ls(pos = i, all.names = TRUE)
+  })
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..37fab58
--- /dev/null
+++ b/README.md
@@ -0,0 +1,81 @@
+
+
+[![Travis-CI Build Status](https://travis-ci.org/kevinushey/sourcetools.svg?branch=master)](https://travis-ci.org/kevinushey/sourcetools) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/kevinushey/sourcetools?branch=master&svg=true)](https://ci.appveyor.com/project/kevinushey/sourcetools)
+
+
+# sourcetools
+
+Tools for reading, tokenizing, and (eventually) parsing `R` code.
+
+## Getting Started
+
+`sourcetools` is not yet on CRAN -- install with
+
+
+```r
+devtools::install_github("kevinushey/sourcetools")
+```
+
+## Reading
+
+`sourcetools` comes with a couple of fast functions for reading
+files into `R`.
+
+Use `read()` and `read_lines()` to quickly read a file into
+`R` as character vectors. `read_lines()` handles both
+Windows-style `\r\n` and Unix-style `\n` line endings.
+
+
+```r
+text <- replicate(10000, paste(sample(letters, 200, TRUE), collapse = ""))
+file <- tempfile()
+cat(text, file = file, sep = "\n")
+mb <- microbenchmark::microbenchmark(times = 10,
+  readChar   = readChar(file, file.info(file)$size, TRUE),
+  readLines  = readLines(file),
+  read       = read(file),
+  read_lines = read_lines(file)
+)
+print(mb, digits = 3)
+```
+
+```
+## Unit: milliseconds
+##        expr   min     lq  mean median     uq    max neval cld
+##    readChar   5.2   6.54  10.5   7.02   8.73  36.56    10 ab 
+##   readLines 155.9 159.69 162.4 161.95 163.15 171.76    10   c
+##        read   5.3   5.48   6.5   5.97   7.52   9.35    10 a  
+##  read_lines  13.5  13.95  14.4  14.09  14.50  16.97    10  b
+```
+
+```r
+unlink(file)
+```
+
+## Tokenization
+
+`sourcetools` provides the `tokenize_string()` and
+`tokenize_file()` functions for generating a tokenized
+representation of R code. These produce 'raw' tokenized
+representations of the code, with each token's value as a
+string, and a recorded row, column, and type:
+
+
+```r
+tokenize_string("if (x < 10) 20")
+```
+
+```
+##    value row column       type
+## 1     if   1      1    keyword
+## 2          1      3 whitespace
+## 3      (   1      4    bracket
+## 4      x   1      5     symbol
+## 5          1      6 whitespace
+## 6      <   1      7   operator
+## 7          1      8 whitespace
+## 8     10   1      9     number
+## 9      )   1     11    bracket
+## 10         1     12 whitespace
+## 11    20   1     13     number
+```
diff --git a/debian/README.test b/debian/README.test
deleted file mode 100644
index 8d70ca3..0000000
--- a/debian/README.test
+++ /dev/null
@@ -1,9 +0,0 @@
-Notes on how this package can be tested.
-────────────────────────────────────────
-
-This package can be tested by running the provided test:
-
-cd tests
-LC_ALL=C R --no-save < testthat.R
-
-in order to confirm its integrity.
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 9c1f893..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-cran-sourcetools (0.1.5-1) unstable; urgency=medium
-
-  * Initial release (closes: #842958)
-
- -- Andreas Tille <tille at debian.org>  Wed, 02 Nov 2016 17:20:55 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 45efbe4..0000000
--- a/debian/control
+++ /dev/null
@@ -1,25 +0,0 @@
-Source: r-cran-sourcetools
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 9),
-               dh-r,
-               r-base-dev
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-cran-sourcetools/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-cran-sourcetools/trunk/
-Homepage: https://cran.r-project.org/package=sourcetools
-
-Package: r-cran-sourcetools
-Architecture: any
-Depends: ${R:Depends},
-         ${shlibs:Depends},
-         ${misc:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: tools for reading, tokenizing and parsing R code
- Tools for the reading and tokenization of R code. The
- 'sourcetools' package provides both an R and C++ interface for the tokenization
- of R code, and helpers for interacting with the tokenized representation of R
- code.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 4decd40..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,33 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: sourcetools
-Upstream-Contact: Kevin Ushey <kevinushey at gmail.com>
-Source: https://cran.r-project.org/package=sourcetools
-
-Files: *
-Copyright: 2015-2016 Kevin Ushey
-License: MIT
-
-Files: debian/*
-Copyright: 2016 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- .
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 960011c..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,3 +0,0 @@
-tests
-debian/README.test
-debian/tests/run-unit-test
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 529c38a..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/make -f
-
-%:
-	dh $@ --buildsystem R
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/tests/control b/debian/tests/control
deleted file mode 100644
index b044b0c..0000000
--- a/debian/tests/control
+++ /dev/null
@@ -1,3 +0,0 @@
-Tests: run-unit-test
-Depends: @, r-cran-testthat
-Restrictions: allow-stderr
diff --git a/debian/tests/run-unit-test b/debian/tests/run-unit-test
deleted file mode 100644
index dca0adf..0000000
--- a/debian/tests/run-unit-test
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh -e
-
-oname=sourcetools
-pkg=r-cran-`echo $oname | tr '[A-Z]' '[a-z]'`
-
-if [ "$ADTTMP" = "" ] ; then
-  ADTTMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
-  trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
-fi
-cd $ADTTMP
-cp -a /usr/share/doc/${pkg}/tests/* $ADTTMP
-find . -name "*.gz" -exec gunzip \{\} \;
-LC_ALL=C R --no-save < testthat.R
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 3236e50..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=4
-https://cran.r-project.org/src/contrib/sourcetools_([-\d.]*)\.tar\.gz
diff --git a/inst/include/sourcetools.h b/inst/include/sourcetools.h
new file mode 100644
index 0000000..c10c821
--- /dev/null
+++ b/inst/include/sourcetools.h
@@ -0,0 +1,12 @@
+#ifndef SOURCE_TOOLS_H
+#define SOURCE_TOOLS_H
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/platform/platform.h>
+#include <sourcetools/collection/collection.h>
+#include <sourcetools/utf8/utf8.h>
+#include <sourcetools/cursor/cursor.h>
+#include <sourcetools/read/read.h>
+#include <sourcetools/tokenization/tokenization.h>
+
+#endif
diff --git a/inst/include/sourcetools/collection/Position.h b/inst/include/sourcetools/collection/Position.h
new file mode 100644
index 0000000..a0397a4
--- /dev/null
+++ b/inst/include/sourcetools/collection/Position.h
@@ -0,0 +1,77 @@
+#ifndef SOURCETOOLS_COLLECTION_POSITION_H
+#define SOURCETOOLS_COLLECTION_POSITION_H
+
+#include <ostream>
+#include <cstddef>
+
+namespace sourcetools {
+namespace collections {
+
+struct Position
+{
+  Position()
+    : row(0), column(0)
+  {
+  }
+
+  Position(std::size_t row, std::size_t column)
+    : row(row), column(column)
+  {
+  }
+
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const Position& position)
+  {
+    os << position.row << ":" << position.column;
+    return os;
+  }
+
+  friend bool operator <(const Position& lhs, const Position& rhs)
+  {
+    return
+      lhs.row < rhs.row ||
+      (lhs.row == rhs.row && lhs.column < rhs.column);
+  }
+
+  friend bool operator <=(const Position& lhs, const Position& rhs)
+  {
+    return
+      lhs.row < rhs.row ||
+      (lhs.row == rhs.row && lhs.column <= rhs.column);
+  }
+
+  friend bool operator ==(const Position& lhs, const Position& rhs)
+  {
+    return
+      lhs.row == rhs.row &&
+      lhs.column == rhs.column;
+  }
+
+  friend bool operator >(const Position& lhs, const Position& rhs)
+  {
+    return
+      lhs.row > rhs.row ||
+      (lhs.row == rhs.row && lhs.column > rhs.column);
+  }
+
+  friend bool operator >=(const Position& lhs, const Position& rhs)
+  {
+    return
+      lhs.row > rhs.row ||
+      (lhs.row == rhs.row && lhs.column >= rhs.column);
+  }
+
+  friend Position operator +(const Position& lhs, std::size_t rhs)
+  {
+    return Position(lhs.row, lhs.column + rhs);
+  }
+
+  std::size_t row;
+  std::size_t column;
+
+};
+
+} // namespace collections
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_COLLECTION_POSITION_H */
diff --git a/inst/include/sourcetools/collection/Range.h b/inst/include/sourcetools/collection/Range.h
new file mode 100644
index 0000000..36f8f61
--- /dev/null
+++ b/inst/include/sourcetools/collection/Range.h
@@ -0,0 +1,34 @@
+#ifndef SOURCETOOLS_COLLECTION_RANGE_H
+#define SOURCETOOLS_COLLECTION_RANGE_H
+
+#include <ostream>
+#include <sourcetools/collection/Position.h>
+
+namespace sourcetools {
+namespace collections {
+
+class Range
+{
+public:
+  Range(const Position& start, const Position& end)
+    : start_(start), end_(end)
+  {
+  }
+
+  friend std::ostream& operator <<(std::ostream& os, const Range& range)
+  {
+    os << "[" << range.start() << "-" << range.end() << "]";
+    return os;
+  }
+
+  const Position start() const { return start_; }
+  const Position end() const { return end_; }
+
+private:
+  Position start_;
+  Position end_;
+};
+} // namespace collections
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_COLLECTION_RANGE_H */
diff --git a/inst/include/sourcetools/collection/collection.h b/inst/include/sourcetools/collection/collection.h
new file mode 100644
index 0000000..68c99e2
--- /dev/null
+++ b/inst/include/sourcetools/collection/collection.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_COLLECTION_COLLECTION_H
+#define SOURCETOOLS_COLLECTION_COLLECTION_H
+
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/collection/Range.h>
+
+#endif /* SOURCETOOLS_COLLECTION_COLLECTION_H */
diff --git a/inst/include/sourcetools/core/core.h b/inst/include/sourcetools/core/core.h
new file mode 100644
index 0000000..5b6bed3
--- /dev/null
+++ b/inst/include/sourcetools/core/core.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_CORE_CORE_H
+#define SOURCETOOLS_CORE_CORE_H
+
+#include <sourcetools/core/macros.h>
+#include <sourcetools/core/util.h>
+
+#endif /* SOURCETOOLS_CORE_CORE_H */
diff --git a/inst/include/sourcetools/core/macros.h b/inst/include/sourcetools/core/macros.h
new file mode 100644
index 0000000..57c4400
--- /dev/null
+++ b/inst/include/sourcetools/core/macros.h
@@ -0,0 +1,72 @@
+#ifndef SOURCETOOLS_CORE_MACROS_H
+#define SOURCETOOLS_CORE_MACROS_H
+
+#include <cstdio>
+
+#include <string>
+#include <iostream>
+
+/* Utility */
+#ifdef __GNUC__
+# define LIKELY(x)   __builtin_expect(!!(x), 1)
+# define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+# define LIKELY(x) x
+# define UNLIKELY(x) x
+#endif
+
+#define SOURCE_TOOLS_CHECK_MASK(__SELF__, __MASK__)                    \
+  ((__MASK__ & __SELF__) == __MASK__)
+
+#define SOURCE_TOOLS_LOWER_BITS(__VALUE__, __BITS__)                   \
+  (((1 << __BITS__) - 1) & __VALUE__)
+
+#define SOURCE_TOOLS_PASTE(__X__, __Y__) __X__ ## __Y__
+#define SOURCE_TOOLS_STRINGIFY(__X__) #__X__
+
+/* Logging */
+namespace sourcetools {
+namespace debug {
+
+inline std::string shortFilePath(const std::string& filePath)
+{
+  std::string::size_type index = filePath.find_last_of("/");
+  if (index != std::string::npos)
+    return filePath.substr(index + 1);
+  return filePath;
+}
+
+inline std::string debugPosition(const char* filePath, int line)
+{
+  static const int N = 1024;
+  char buffer[N + 1];
+  std::string shortPath = shortFilePath(filePath);
+  if (shortPath.size() > N / 2)
+    shortPath = shortPath.substr(0, N / 2);
+  std::sprintf(buffer, "[%s:%4i]", shortPath.c_str(), line);
+  return buffer;
+}
+
+} // namespace debug
+} // namespace sourcetools
+
+// Flip on/off as necessary
+#define SOURCE_TOOLS_ENABLE_DEBUG_LOGGING
+
+#ifdef SOURCE_TOOLS_ENABLE_DEBUG_LOGGING
+
+#include <iostream>
+
+#define DEBUG(__X__)                                                   \
+  std::cerr << ::sourcetools::debug::debugPosition(__FILE__, __LINE__) \
+            << ": " << __X__ << ::std::endl;
+#define DEBUG_BLOCK(x)
+
+#else
+
+#define DEBUG(x)
+#define DEBUG_BLOCK(x) if (false)
+
+#endif
+
+#endif /* SOURCETOOLS_CORE_MACROS_H */
diff --git a/inst/include/sourcetools/core/util.h b/inst/include/sourcetools/core/util.h
new file mode 100644
index 0000000..8d16cfd
--- /dev/null
+++ b/inst/include/sourcetools/core/util.h
@@ -0,0 +1,142 @@
+#ifndef SOURCETOOLS_CORE_UTIL_H
+#define SOURCETOOLS_CORE_UTIL_H
+
+#include <string>
+#include <memory>
+#include <cctype>
+#include <cstdlib>
+
+namespace sourcetools {
+namespace detail {
+
+class noncopyable
+{
+protected:
+  noncopyable() {}
+  ~noncopyable() {}
+
+private:
+  noncopyable(const noncopyable&);
+  noncopyable& operator=(const noncopyable&);
+};
+
+} // namespace detail
+typedef detail::noncopyable noncopyable;
+
+template <typename T>
+class scoped_ptr : noncopyable
+{
+public:
+  explicit scoped_ptr(T* pData) : pData_(pData) {}
+  T& operator*() const { return *pData_; }
+  T* operator->() const { return pData_; }
+  operator T*() const { return pData_; }
+  ~scoped_ptr() { delete pData_; }
+private:
+  T* pData_;
+};
+
+template <typename T>
+class scoped_array : noncopyable
+{
+public:
+  explicit scoped_array(T* pData) : pData_(pData) {}
+  T& operator*() const { return *pData_; }
+  T* operator->() const { return pData_; }
+  operator T*() const { return pData_; }
+  ~scoped_array() { delete[] pData_; }
+private:
+  T* pData_;
+};
+
+namespace utils {
+
+inline bool isWhitespace(char ch)
+{
+  return
+    ch == ' ' ||
+    ch == '\f' ||
+    ch == '\r' ||
+    ch == '\n' ||
+    ch == '\t' ||
+    ch == '\v';
+}
+
+template <typename T>
+inline bool countWhitespaceBytes(const char* data,
+                                 T* pBytes)
+{
+  T bytes = 0;
+  while (isWhitespace(*data)) {
+    ++data;
+    ++bytes;
+  }
+
+  *pBytes = bytes;
+  return bytes != 0;
+}
+
+inline bool isDigit(char ch)
+{
+  return
+    (ch >= '0' && ch <= '9');
+}
+
+inline bool isAlphabetic(char ch)
+{
+  return
+    (ch >= 'a' && ch <= 'z') ||
+    (ch >= 'A' && ch <= 'Z');
+}
+
+inline bool isAlphaNumeric(char ch)
+{
+  return
+    (ch >= 'a' && ch <= 'z') ||
+    (ch >= 'A' && ch <= 'Z') ||
+    (ch >= '0' && ch <= '9');
+}
+
+inline bool isHexDigit(char ch)
+{
+  return
+    (ch >= '0' && ch <= '9') ||
+    (ch >= 'a' && ch <= 'f') ||
+    (ch >= 'A' && ch <= 'F');
+}
+
+inline bool isValidForStartOfRSymbol(char ch)
+{
+  return
+    isAlphabetic(ch) ||
+    ch == '.' ||
+    ch < 0;
+}
+
+inline bool isValidForRSymbol(char ch)
+{
+  return
+    isAlphaNumeric(ch) ||
+    ch == '.' ||
+    ch == '_' ||
+    ch < 0;
+}
+
+inline std::string escape(char ch)
+{
+  switch (ch) {
+  case '\r':
+    return "\\r";
+  case '\n':
+    return "\\n";
+  case '\t':
+    return "\\t";
+  default:
+    return std::string(1, ch);
+  }
+}
+
+} // namespace utils
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CORE_UTIL_H */
diff --git a/inst/include/sourcetools/cursor/TextCursor.h b/inst/include/sourcetools/cursor/TextCursor.h
new file mode 100644
index 0000000..1a5a212
--- /dev/null
+++ b/inst/include/sourcetools/cursor/TextCursor.h
@@ -0,0 +1,66 @@
+#ifndef SOURCETOOLS_CURSOR_TEXT_CURSOR_H
+#define SOURCETOOLS_CURSOR_TEXT_CURSOR_H
+
+#include <string>
+
+#include <sourcetools/core/macros.h>
+#include <sourcetools/collection/Position.h>
+
+namespace sourcetools {
+namespace cursors {
+
+class TextCursor
+{
+public:
+
+  TextCursor(const char* text, std::size_t n)
+      : text_(text),
+        n_(n),
+        offset_(0),
+        position_(0, 0)
+  {
+  }
+
+  char peek(std::size_t offset = 0)
+  {
+    std::size_t index = offset_ + offset;
+    if (UNLIKELY(index >= n_))
+      return '\0';
+    return text_[index];
+  }
+
+  void advance(std::size_t times = 1)
+  {
+    for (std::size_t i = 0; i < times; ++i) {
+      if (peek() == '\n') {
+        ++position_.row;
+        position_.column = 0;
+      } else {
+        ++position_.column;
+      }
+      ++offset_;
+    }
+  }
+
+  operator const char*() const { return text_ + offset_; }
+
+  std::size_t offset() const { return offset_; }
+
+  const collections::Position& position() const { return position_; }
+  std::size_t row() const { return position_.row; }
+  std::size_t column() const { return position_.column; }
+
+  const char* begin() const { return text_; }
+  const char* end() const { return text_ + n_; }
+
+private:
+  const char* text_;
+  std::size_t n_;
+  std::size_t offset_;
+  collections::Position position_;
+};
+
+} // namespace cursors
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CURSOR_TEXT_CURSOR_H */
diff --git a/inst/include/sourcetools/cursor/TokenCursor.h b/inst/include/sourcetools/cursor/TokenCursor.h
new file mode 100644
index 0000000..e0f3025
--- /dev/null
+++ b/inst/include/sourcetools/cursor/TokenCursor.h
@@ -0,0 +1,321 @@
+#ifndef SOURCETOOLS_CURSOR_TOKEN_CURSOR_H
+#define SOURCETOOLS_CURSOR_TOKEN_CURSOR_H
+
+#include <cstring>
+#include <algorithm>
+
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/tokenization/Token.h>
+
+namespace sourcetools {
+namespace cursors {
+
+class TokenCursor {
+
+private:
+  typedef collections::Position Position;
+  typedef tokens::Token Token;
+
+public:
+
+  TokenCursor(const std::vector<Token>& tokens)
+    : tokens_(tokens),
+      offset_(0),
+      n_(tokens.size()),
+      noSuchToken_(tokens::END)
+  {}
+
+  bool moveToNextToken()
+  {
+    if (UNLIKELY(offset_ >= n_ - 1))
+      return false;
+
+    ++offset_;
+    return true;
+  }
+
+  bool moveToNextSignificantToken()
+  {
+    if (!moveToNextToken())
+      return false;
+
+    if (!fwdOverWhitespaceAndComments())
+      return false;
+
+    return true;
+  }
+
+  bool moveToPreviousToken()
+  {
+    if (UNLIKELY(offset_ == 0))
+      return false;
+
+    --offset_;
+    return true;
+  }
+
+  bool moveToPreviousSignificantToken()
+  {
+    if (!moveToPreviousToken())
+      return false;
+
+    if (!bwdOverWhitespaceAndComments())
+      return false;
+
+    return true;
+  }
+
+  const Token& peekFwd(std::size_t offset = 1) const
+  {
+    std::size_t index = offset_ + offset;
+    if (UNLIKELY(index >= n_))
+      return noSuchToken_;
+
+    return tokens_[index];
+  }
+
+  const Token& peekBwd(std::size_t offset = 1) const
+  {
+    if (UNLIKELY(offset > offset_))
+      return noSuchToken_;
+
+    std::size_t index = offset_ - offset;
+    return tokens_[index];
+  }
+
+  const Token& currentToken() const
+  {
+    if (UNLIKELY(offset_ >= n_))
+      return noSuchToken_;
+    return tokens_[offset_];
+  }
+
+  operator const Token&() const { return currentToken(); }
+
+  bool fwdOverWhitespace()
+  {
+    while (isType(tokens::WHITESPACE))
+      if (!moveToNextToken())
+        return false;
+    return true;
+  }
+
+  bool bwdOverWhitespace()
+  {
+    while (isType(tokens::WHITESPACE))
+      if (!moveToPreviousToken())
+        return false;
+    return true;
+  }
+
+  bool fwdOverComments()
+  {
+    while (isType(tokens::COMMENT))
+      if (!moveToNextToken())
+        return false;
+    return true;
+  }
+
+  bool bwdOverComments()
+  {
+    while (isType(tokens::COMMENT))
+      if (!moveToPreviousToken())
+        return false;
+    return true;
+  }
+
+  bool fwdOverWhitespaceAndComments()
+  {
+    while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE))
+      if (!moveToNextToken())
+        return false;
+    return true;
+  }
+
+  bool bwdOverWhitespaceAndComments()
+  {
+    while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE))
+      if (!moveToPreviousToken())
+        return false;
+    return true;
+  }
+
+  const Token& nextSignificantToken(std::size_t times = 1) const
+  {
+    TokenCursor clone(*this);
+    for (std::size_t i = 0; i < times; ++i)
+      clone.moveToNextSignificantToken();
+    return clone;
+  }
+
+  const Token& previousSignificantToken(std::size_t times = 1) const
+  {
+    TokenCursor clone(*this);
+    for (std::size_t i = 0; i < times; ++i)
+      clone.moveToPreviousSignificantToken();
+    return clone;
+  }
+
+  bool moveToPosition(std::size_t row, std::size_t column)
+  {
+    return moveToPosition(Position(row, column));
+  }
+
+  bool moveToPosition(const Position& target)
+  {
+    if (UNLIKELY(n_ == 0))
+      return false;
+
+    if (UNLIKELY(tokens_[n_ - 1].position() <= target))
+    {
+      offset_ = n_ - 1;
+      return true;
+    }
+
+    std::size_t start  = 0;
+    std::size_t end    = n_;
+
+    std::size_t offset = 0;
+    while (true)
+    {
+      offset = (start + end) / 2;
+      const Position& current = tokens_[offset].position();
+
+      if (current == target || start == end)
+        break;
+      else if (current < target)
+        start = offset + 1;
+      else
+        end = offset - 1;
+    }
+
+    offset_ = offset;
+    return true;
+  }
+
+  template <typename F>
+  bool findFwd(F f)
+  {
+    do {
+      if (f(this))
+        return true;
+    } while (moveToNextToken());
+
+    return false;
+  }
+
+  template <typename F>
+  bool findBwd(F f)
+  {
+    do {
+      if (f(this))
+        return true;
+    } while (moveToPreviousToken());
+
+    return false;
+  }
+
+  bool findFwd(const char* contents)
+  {
+    return findFwd(std::string(contents, std::strlen(contents)));
+  }
+
+  bool findFwd(const std::string& contents)
+  {
+    do {
+      if (currentToken().contentsEqual(contents))
+        return true;
+    } while (moveToNextToken());
+
+    return false;
+  }
+
+  bool findBwd(const char* contents)
+  {
+    return findBwd(std::string(contents, std::strlen(contents)));
+  }
+
+  bool findBwd(const std::string& contents)
+  {
+    do {
+      if (currentToken().contentsEqual(contents))
+        return true;
+    } while (moveToPreviousToken());
+
+    return false;
+  }
+
+  bool fwdToMatchingBracket()
+  {
+    using namespace tokens;
+    if (!isLeftBracket(currentToken()))
+      return false;
+
+    TokenType lhs = currentToken().type();
+    TokenType rhs = complement(lhs);
+    std::size_t balance = 1;
+
+    while (moveToNextSignificantToken())
+    {
+      TokenType type = currentToken().type();
+      balance += type == lhs;
+      balance -= type == rhs;
+      if (balance == 0) return true;
+    }
+
+    return false;
+  }
+
+  bool bwdToMatchingBracket()
+  {
+    using namespace tokens;
+    if (!isRightBracket(currentToken()))
+      return false;
+
+    TokenType lhs = currentToken().type();
+    TokenType rhs = complement(lhs);
+    std::size_t balance = 1;
+
+    while (moveToPreviousSignificantToken())
+    {
+      TokenType type = currentToken().type();
+      balance += type == lhs;
+      balance -= type == rhs;
+      if (balance == 0) return true;
+    }
+
+    return false;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TokenCursor& cursor)
+  {
+    return os << toString(cursor.currentToken());
+  }
+
+  tokens::TokenType type() const { return currentToken().type(); }
+  bool isType(tokens::TokenType type) const { return currentToken().isType(type); }
+  collections::Position position() const { return currentToken().position(); }
+  std::size_t offset() const { return offset_; }
+  std::size_t row() const { return currentToken().row(); }
+  std::size_t column() const { return currentToken().column(); }
+
+
+private:
+
+  const std::vector<Token>& tokens_;
+  std::size_t offset_;
+  std::size_t n_;
+  Token noSuchToken_;
+
+};
+
+} // namespace cursors
+
+inline std::string toString(const cursors::TokenCursor& cursor)
+{
+  return toString(cursor.currentToken());
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_CURSOR_TOKEN_CURSOR_H */
diff --git a/inst/include/sourcetools/cursor/cursor.h b/inst/include/sourcetools/cursor/cursor.h
new file mode 100644
index 0000000..e0b1cdd
--- /dev/null
+++ b/inst/include/sourcetools/cursor/cursor.h
@@ -0,0 +1,7 @@
+#ifndef SOURCETOOLS_CURSOR_CURSOR_H
+#define SOURCETOOLS_CURSOR_CURSOR_H
+
+#include <sourcetools/cursor/TextCursor.h>
+#include <sourcetools/cursor/TokenCursor.h>
+
+#endif /* SOURCETOOLS_CURSOR_CURSOR_H */
diff --git a/inst/include/sourcetools/multibyte/multibyte.h b/inst/include/sourcetools/multibyte/multibyte.h
new file mode 100644
index 0000000..f2c34c4
--- /dev/null
+++ b/inst/include/sourcetools/multibyte/multibyte.h
@@ -0,0 +1,41 @@
+#ifndef SOURCETOOLS_MULTIBYTE_MULTIBYTE_H
+#define SOURCETOOLS_MULTIBYTE_MULTIBYTE_H
+
+#include <cstdlib>
+#include <cwchar>
+
+namespace sourcetools {
+namespace multibyte {
+
+template <typename T>
+inline bool countWhitespaceBytes(const char* data,
+                                 T* pBytes)
+{
+  wchar_t ch;
+  T bytes = 0;
+  const char* it = data;
+
+  while (true) {
+
+    int status = std::mbtowc(&ch, it, MB_CUR_MAX);
+    if (status == 0) {
+      break;
+    } else if (status == -1) {
+      break;
+    }
+
+    if (!std::iswspace(ch))
+      break;
+
+    bytes += status;
+    it += status;
+  }
+
+  *pBytes = bytes;
+  return bytes != 0;
+}
+
+} // namespace multibyte
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_MULTIBYTE_MULTIBYTE_H */
diff --git a/inst/include/sourcetools/platform/platform.h b/inst/include/sourcetools/platform/platform.h
new file mode 100644
index 0000000..2f6d0c2
--- /dev/null
+++ b/inst/include/sourcetools/platform/platform.h
@@ -0,0 +1,20 @@
+#ifndef SOURCETOOLS_PLATFORM_PLATFORM_H
+#define SOURCETOOLS_PLATFORM_PLATFORM_H
+
+#ifdef _WIN32
+# define SOURCETOOLS_PLATFORM_WINDOWS
+#endif
+
+#ifdef __APPLE__
+# define SOURCETOOLS_PLATFORM_MACOS
+#endif
+
+#ifdef __linux__
+# define SOURCETOOLS_PLATFORM_LINUX
+#endif
+
+#if defined(__sun) && defined(__SVR4)
+# define SOURCETOOLS_PLATFORM_SOLARIS
+#endif
+
+#endif /* SOURCETOOLS_PLATFORM_PLATFORM_H */
diff --git a/inst/include/sourcetools/r/RCallRecurser.h b/inst/include/sourcetools/r/RCallRecurser.h
new file mode 100644
index 0000000..6c55f83
--- /dev/null
+++ b/inst/include/sourcetools/r/RCallRecurser.h
@@ -0,0 +1,75 @@
+#ifndef SOURCETOOLS_R_R_CALL_RECURSER_H
+#define SOURCETOOLS_R_R_CALL_RECURSER_H
+
+#include <vector>
+
+#include <sourcetools/core/core.h>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RFunctions.h>
+
+
+namespace sourcetools {
+namespace r {
+
+class CallRecurser : noncopyable
+{
+public:
+
+  class Operation
+  {
+  public:
+    virtual void apply(SEXP dataSEXP) = 0;
+    virtual ~Operation() {}
+  };
+
+  explicit CallRecurser(SEXP dataSEXP)
+  {
+    if (Rf_isPrimitive(dataSEXP))
+      dataSEXP_ = R_NilValue;
+    else if (Rf_isFunction(dataSEXP))
+      dataSEXP_ = r::util::functionBody(dataSEXP);
+    else if (TYPEOF(dataSEXP) == LANGSXP)
+      dataSEXP_ = dataSEXP;
+    else
+      dataSEXP_ = R_NilValue;
+  }
+
+  void add(Operation* pOperation)
+  {
+    operations_.push_back(pOperation);
+  }
+
+  void run()
+  {
+    runImpl(dataSEXP_);
+  }
+
+  void runImpl(SEXP dataSEXP)
+  {
+    for (std::vector<Operation*>::iterator it = operations_.begin();
+         it != operations_.end();
+         ++it)
+    {
+      (*it)->apply(dataSEXP);
+    }
+
+    if (TYPEOF(dataSEXP) == LANGSXP)
+    {
+      while (dataSEXP != R_NilValue)
+      {
+        runImpl(CAR(dataSEXP));
+        dataSEXP = CDR(dataSEXP);
+      }
+    }
+  }
+
+private:
+  SEXP dataSEXP_;
+  std::vector<Operation*> operations_;
+};
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_CALL_RECURSER_H */
diff --git a/inst/include/sourcetools/r/RConverter.h b/inst/include/sourcetools/r/RConverter.h
new file mode 100644
index 0000000..7b1b7dd
--- /dev/null
+++ b/inst/include/sourcetools/r/RConverter.h
@@ -0,0 +1,39 @@
+#ifndef SOURCETOOLS_R_R_CONVERTER_H
+#define SOURCETOOLS_R_R_CONVERTER_H
+
+#include <vector>
+#include <string>
+
+#include <sourcetools/r/RUtils.h>
+#include <sourcetools/r/RHeaders.h>
+
+namespace sourcetools {
+namespace r {
+
+inline SEXP Rf_mkChar(const std::string& data)
+{
+  return Rf_mkCharLen(data.c_str(), data.size());
+}
+
+inline SEXP Rf_mkString(const std::string& data)
+{
+  Protect protect;
+  SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1));
+  SET_STRING_ELT(resultSEXP, 0, Rf_mkChar(data));
+  return resultSEXP;
+}
+
+inline SEXP create(const std::vector<std::string>& vector)
+{
+  Protect protect;
+  std::size_t n = vector.size();
+  SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n));
+  for (std::size_t i = 0; i < n; ++i)
+    SET_STRING_ELT(resultSEXP, i, Rf_mkChar(vector[i]));
+  return resultSEXP;
+}
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_CONVERTER_H */
diff --git a/inst/include/sourcetools/r/RFunctions.h b/inst/include/sourcetools/r/RFunctions.h
new file mode 100644
index 0000000..109ab44
--- /dev/null
+++ b/inst/include/sourcetools/r/RFunctions.h
@@ -0,0 +1,85 @@
+#ifndef SOURCETOOLS_R_R_FUNCTIONS_H
+#define SOURCETOOLS_R_R_FUNCTIONS_H
+
+#include <string>
+#include <set>
+
+#include <sourcetools/r/RUtils.h>
+
+namespace sourcetools {
+namespace r {
+
+inline SEXP eval(const std::string& fn, SEXP envSEXP = NULL)
+{
+  Protect protect;
+  if (envSEXP == NULL)
+  {
+    SEXP strSEXP = protect(Rf_mkString("sourcetools"));
+    envSEXP = R_FindNamespace(strSEXP);
+  }
+
+  SEXP callSEXP = protect(Rf_lang1(Rf_install(fn.c_str())));
+  SEXP resultSEXP = protect(Rf_eval(callSEXP, envSEXP));
+  return resultSEXP;
+}
+
+inline std::set<std::string> objectsOnSearchPath()
+{
+  std::set<std::string> results;
+  Protect protect;
+
+  SEXP objectsSEXP;
+  protect(objectsSEXP = eval("search_objects"));
+
+  for (R_xlen_t i = 0; i < Rf_length(objectsSEXP); ++i)
+  {
+    SEXP strSEXP = VECTOR_ELT(objectsSEXP, i);
+    for (R_xlen_t j = 0; j < Rf_length(strSEXP); ++j)
+    {
+      SEXP charSEXP = STRING_ELT(strSEXP, j);
+      std::string element(CHAR(charSEXP), Rf_length(charSEXP));
+      results.insert(element);
+    }
+  }
+
+  return results;
+}
+
+namespace util {
+
+inline void setNames(SEXP dataSEXP, const char** names, std::size_t n)
+{
+  RObjectFactory factory;
+  SEXP namesSEXP = factory.create(STRSXP, n);
+  for (std::size_t i = 0; i < n; ++i)
+    SET_STRING_ELT(namesSEXP, i, Rf_mkChar(names[i]));
+  Rf_setAttrib(dataSEXP, R_NamesSymbol, namesSEXP);
+}
+
+inline void listToDataFrame(SEXP listSEXP, int n)
+{
+  r::Protect protect;
+  SEXP classSEXP = protect(Rf_mkString("data.frame"));
+  Rf_setAttrib(listSEXP, R_ClassSymbol, classSEXP);
+
+  SEXP rownamesSEXP = protect(Rf_allocVector(INTSXP, 2));
+  INTEGER(rownamesSEXP)[0] = NA_INTEGER;
+  INTEGER(rownamesSEXP)[1] = -n;
+  Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP);
+}
+
+inline SEXP functionBody(SEXP fnSEXP)
+{
+  SEXP bodyFunctionSEXP = Rf_findFun(Rf_install("body"), R_BaseNamespace);
+
+  r::Protect protect;
+  SEXP callSEXP = protect(Rf_lang2(bodyFunctionSEXP, fnSEXP));
+  return Rf_eval(callSEXP, R_BaseNamespace);
+}
+
+} // namespace util
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_FUNCTIONS_H */
diff --git a/inst/include/sourcetools/r/RHeaders.h b/inst/include/sourcetools/r/RHeaders.h
new file mode 100644
index 0000000..89e2130
--- /dev/null
+++ b/inst/include/sourcetools/r/RHeaders.h
@@ -0,0 +1,8 @@
+#ifndef SOURCETOOLS_R_R_HEADERS_H
+#define SOURCETOOLS_R_R_HEADERS_H
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#endif /* SOURCETOOLS_R_R_HEADERS_H */
diff --git a/inst/include/sourcetools/r/RNonStandardEvaluation.h b/inst/include/sourcetools/r/RNonStandardEvaluation.h
new file mode 100644
index 0000000..cb7f7df
--- /dev/null
+++ b/inst/include/sourcetools/r/RNonStandardEvaluation.h
@@ -0,0 +1,149 @@
+#ifndef SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H
+#define SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H
+
+#include <set>
+#include <map>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RCallRecurser.h>
+
+namespace sourcetools {
+namespace r {
+namespace nse {
+
+namespace detail {
+
+inline std::set<std::string> makeNsePrimitives()
+{
+  std::set<std::string> instance;
+
+  instance.insert("quote");
+  instance.insert("substitute");
+  instance.insert("eval");
+  instance.insert("evalq");
+  instance.insert("lazy_dots");
+
+  return instance;
+}
+
+inline std::set<std::string>& nsePrimitives()
+{
+  static std::set<std::string> instance = makeNsePrimitives();
+  return instance;
+}
+
+class PerformsNonStandardEvaluationOperation
+  : public r::CallRecurser::Operation
+{
+public:
+
+  PerformsNonStandardEvaluationOperation()
+    : status_(false)
+  {
+  }
+
+  virtual void apply(SEXP dataSEXP)
+  {
+    if (status_ || TYPEOF(dataSEXP) != LANGSXP)
+      return;
+
+    if ((status_ = checkCall(dataSEXP)))
+      return;
+
+    SEXP fnSEXP = CAR(dataSEXP);
+    if (TYPEOF(fnSEXP) == SYMSXP)
+      status_ = nsePrimitives().count(CHAR(PRINTNAME(fnSEXP)));
+    else if (TYPEOF(fnSEXP) == STRSXP)
+      status_ = nsePrimitives().count(CHAR(STRING_ELT(fnSEXP, 0)));
+
+  }
+
+  bool status() const { return status_; }
+
+private:
+
+  bool checkCall(SEXP callSEXP)
+  {
+    std::size_t n = Rf_length(callSEXP);
+    if (n == 0)
+      return false;
+
+    SEXP fnSEXP = CAR(callSEXP);
+    if (fnSEXP == Rf_install("::") || fnSEXP == Rf_install(":::"))
+    {
+      SEXP lhsSEXP = CADR(callSEXP);
+      SEXP rhsSEXP = CADDR(callSEXP);
+
+      if (lhsSEXP == Rf_install("lazyeval") && rhsSEXP == Rf_install("lazy_dots"))
+        return true;
+    }
+
+    return false;
+  }
+
+private:
+  bool status_;
+};
+
+} // namespace detail
+
+class Database
+{
+public:
+  bool check(SEXP dataSEXP)
+  {
+    if (contains(dataSEXP))
+      return get(dataSEXP);
+
+    typedef detail::PerformsNonStandardEvaluationOperation Operation;
+    scoped_ptr<Operation> operation(new Operation);
+
+    r::CallRecurser recurser(dataSEXP);
+    recurser.add(operation);
+    recurser.run();
+
+    set(dataSEXP, operation->status());
+    return operation->status();
+  }
+
+private:
+
+  bool contains(SEXP dataSEXP)
+  {
+    return map_.count(address(dataSEXP));
+  }
+
+  bool get(SEXP dataSEXP)
+  {
+    return map_[address(dataSEXP)];
+  }
+
+  void set(SEXP dataSEXP, bool value)
+  {
+    map_[address(dataSEXP)] = value;
+  }
+
+  std::size_t address(SEXP dataSEXP)
+  {
+    return reinterpret_cast<std::size_t>(dataSEXP);
+  }
+
+  std::map<std::size_t, bool> map_;
+};
+
+inline Database& database()
+{
+  static Database instance;
+  return instance;
+}
+
+inline bool performsNonStandardEvaluation(SEXP fnSEXP)
+{
+  return database().check(fnSEXP);
+}
+
+} // namespace nse
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H */
diff --git a/inst/include/sourcetools/r/RUtils.h b/inst/include/sourcetools/r/RUtils.h
new file mode 100644
index 0000000..8fcc9e4
--- /dev/null
+++ b/inst/include/sourcetools/r/RUtils.h
@@ -0,0 +1,100 @@
+#ifndef SOURCETOOLS_R_R_UTILS_H
+#define SOURCETOOLS_R_R_UTILS_H
+
+#include <vector>
+
+#include <sourcetools/core/core.h>
+
+#include <sourcetools/r/RHeaders.h>
+
+namespace sourcetools {
+namespace r {
+
+class Protect : noncopyable
+{
+public:
+  Protect(): n_(0) {}
+  ~Protect() { UNPROTECT(n_); }
+
+  SEXP operator()(SEXP objectSEXP)
+  {
+    ++n_;
+    return PROTECT(objectSEXP);
+  }
+
+private:
+  int n_;
+};
+
+class RObjectFactory : noncopyable
+{
+public:
+
+  RObjectFactory()
+    : n_(0)
+  {
+  }
+
+  template <typename T, typename F>
+  SEXP create(SEXPTYPE type, const std::vector<T>& vector, F f)
+  {
+    ++n_;
+    std::size_t n = vector.size();
+    SEXP resultSEXP = PROTECT(Rf_allocVector(type, n));
+    for (std::size_t i = 0; i < n; ++i)
+      f(resultSEXP, i, vector[i]);
+    return resultSEXP;
+  }
+
+  SEXP create(SEXPTYPE type, std::size_t n)
+  {
+    ++n_;
+    return PROTECT(Rf_allocVector(type, n));
+  }
+
+  ~RObjectFactory()
+  {
+    UNPROTECT(n_);
+  }
+
+private:
+  std::size_t n_;
+};
+
+class ListBuilder : noncopyable
+{
+public:
+
+  void add(const std::string& name, SEXP value)
+  {
+    names_.push_back(name);
+    data_.push_back(protect_(value));
+  }
+
+  operator SEXP() const
+  {
+    std::size_t n = data_.size();
+
+    SEXP resultSEXP = protect_(Rf_allocVector(VECSXP, n));
+    SEXP namesSEXP  = protect_(Rf_allocVector(STRSXP, n));
+
+    for (std::size_t i = 0; i < n; ++i)
+    {
+      SET_VECTOR_ELT(resultSEXP, i, data_[i]);
+      SET_STRING_ELT(namesSEXP, i, Rf_mkCharLen(names_[i].c_str(), names_[i].size()));
+    }
+
+    Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP);
+    return resultSEXP;
+  }
+
+private:
+  std::vector<std::string> names_;
+  std::vector<SEXP> data_;
+  mutable Protect protect_;
+};
+
+} // namespace r
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_R_R_UTILS_H */
diff --git a/inst/include/sourcetools/r/r.h b/inst/include/sourcetools/r/r.h
new file mode 100644
index 0000000..1076272
--- /dev/null
+++ b/inst/include/sourcetools/r/r.h
@@ -0,0 +1,11 @@
+#ifndef SOURCETOOLS_R_R_H
+#define SOURCETOOLS_R_R_H
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RUtils.h>
+#include <sourcetools/r/RConverter.h>
+#include <sourcetools/r/RFunctions.h>
+#include <sourcetools/r/RCallRecurser.h>
+#include <sourcetools/r/RNonStandardEvaluation.h>
+
+#endif /* SOURCETOOLS_R_R_H */
diff --git a/inst/include/sourcetools/read/MemoryMappedReader.h b/inst/include/sourcetools/read/MemoryMappedReader.h
new file mode 100644
index 0000000..a541379
--- /dev/null
+++ b/inst/include/sourcetools/read/MemoryMappedReader.h
@@ -0,0 +1,139 @@
+#ifndef SOURCETOOLS_READ_MEMORY_MAPPED_READER_H
+#define SOURCETOOLS_READ_MEMORY_MAPPED_READER_H
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include <sourcetools/core/macros.h>
+
+#include <sourcetools/r/RHeaders.h>
+#include <sourcetools/r/RUtils.h>
+
+#ifndef _WIN32
+# include <sourcetools/read/posix/FileConnection.h>
+# include <sourcetools/read/posix/MemoryMappedConnection.h>
+#else
+# include <sourcetools/read/windows/FileConnection.h>
+# include <sourcetools/read/windows/MemoryMappedConnection.h>
+#endif
+
+namespace sourcetools {
+namespace detail {
+
+class MemoryMappedReader
+{
+public:
+
+  class VectorReader
+  {
+  public:
+
+    explicit VectorReader(std::vector<std::string>* pData)
+      : pData_(pData)
+    {
+    }
+
+    template <typename T>
+    void operator()(const T& lhs, const T& rhs)
+    {
+      pData_->push_back(std::string(lhs, rhs));
+    }
+
+  private:
+    std::vector<std::string>* pData_;
+  };
+
+  static bool read(const char* path, std::string* pContent)
+  {
+    // Open file connection
+    FileConnection conn(path);
+    if (!conn.open())
+      return false;
+
+    // Get size of file
+    std::size_t size;
+    if (!conn.size(&size))
+      return false;
+
+    // Early return for empty files
+    if (UNLIKELY(size == 0))
+      return true;
+
+    // mmap the file
+    MemoryMappedConnection map(conn, size);
+    if (!map.open())
+      return false;
+
+    pContent->assign(map, size);
+    return true;
+  }
+
+  template <typename F>
+  static bool read_lines(const char* path, F f)
+  {
+    FileConnection conn(path);
+    if (!conn.open())
+      return false;
+
+    // Get size of file
+    std::size_t size;
+    if (!conn.size(&size))
+      return false;
+
+    // Early return for empty files
+    if (UNLIKELY(size == 0))
+      return true;
+
+    // mmap the file
+    MemoryMappedConnection map(conn, size);
+    if (!map.open())
+      return false;
+
+    // special case: just a '\n'
+    bool endsWithNewline = map[size - 1] == '\n';
+    if (size == 1 && endsWithNewline)
+      return true;
+
+    // Search for newlines
+    const char* lower = map;
+    const char* upper = map;
+    const char* end = map + size;
+    while (true)
+    {
+      upper = std::find(lower, end, '\n');
+      if (upper == end)
+        break;
+
+      // Handle '\r\n'
+      int CR = *(upper - 1) == '\r';
+      upper -= CR;
+
+      // Pass to functor
+      f(lower, upper);
+
+      // Update
+      lower = upper + 1 + CR;
+    }
+
+    // If this file ended with a newline, we're done
+    if (endsWithNewline)
+      return true;
+
+    // Otherwise, consume one more string, then we're done
+    f(lower, end);
+    return true;
+  }
+
+  static bool read_lines(const char* path, std::vector<std::string>* pContent)
+  {
+    VectorReader reader(pContent);
+    return read_lines(path, reader);
+  }
+
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_MEMORY_MAPPED_READER_H */
diff --git a/inst/include/sourcetools/read/posix/FileConnection.h b/inst/include/sourcetools/read/posix/FileConnection.h
new file mode 100644
index 0000000..eaf5072
--- /dev/null
+++ b/inst/include/sourcetools/read/posix/FileConnection.h
@@ -0,0 +1,58 @@
+#ifndef SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H
+#define SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H
+
+#include <cstddef>
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+namespace sourcetools {
+namespace detail {
+
+class FileConnection
+{
+public:
+
+  typedef int FileDescriptor;
+
+  FileConnection(const char* path, int flags = O_RDONLY)
+  {
+    fd_ = ::open(path, flags);
+  }
+
+  ~FileConnection()
+  {
+    if (open())
+      ::close(fd_);
+  }
+
+  bool open()
+  {
+    return fd_ != -1;
+  }
+
+  bool size(std::size_t* pSize)
+  {
+    struct stat info;
+    if (::fstat(fd_, &info) == -1)
+      return false;
+
+    *pSize = info.st_size;
+    return true;
+  }
+
+  operator FileDescriptor() const
+  {
+    return fd_;
+  }
+
+private:
+  FileDescriptor fd_;
+};
+
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/posix/MemoryMappedConnection.h b/inst/include/sourcetools/read/posix/MemoryMappedConnection.h
new file mode 100644
index 0000000..5782ce2
--- /dev/null
+++ b/inst/include/sourcetools/read/posix/MemoryMappedConnection.h
@@ -0,0 +1,55 @@
+#ifndef SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H
+#define SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H
+
+#include <cstdlib>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <sourcetools/platform/platform.h>
+
+namespace sourcetools {
+namespace detail {
+
+class MemoryMappedConnection
+{
+public:
+
+  MemoryMappedConnection(int fd, std::size_t size)
+    : size_(size)
+  {
+#ifdef MAP_POPULATE
+    map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
+#else
+    map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
+#endif
+
+#if defined(POSIX_MADV_SEQUENTIAL) && defined(POSIX_MADV_WILLNEED)
+    ::posix_madvise((void*) map_, size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);
+#endif
+  }
+
+  ~MemoryMappedConnection()
+  {
+    if (map_ != MAP_FAILED)
+      ::munmap(map_, size_);
+  }
+
+  bool open()
+  {
+    return map_ != MAP_FAILED;
+  }
+
+  operator char*() const
+  {
+    return map_;
+  }
+
+private:
+  char* map_;
+  std::size_t size_;
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/read.h b/inst/include/sourcetools/read/read.h
new file mode 100644
index 0000000..e5074f8
--- /dev/null
+++ b/inst/include/sourcetools/read/read.h
@@ -0,0 +1,24 @@
+#ifndef SOURCETOOLS_READ_READ_H
+#define SOURCETOOLS_READ_READ_H
+
+#include <vector>
+#include <string>
+
+#include <sourcetools/read/MemoryMappedReader.h>
+
+namespace sourcetools {
+
+inline bool read(const std::string& absolutePath, std::string* pContent)
+{
+  return detail::MemoryMappedReader::read(absolutePath.c_str(), pContent);
+}
+
+inline bool read_lines(const std::string& absolutePath,
+                       std::vector<std::string>* pLines)
+{
+  return detail::MemoryMappedReader::read_lines(absolutePath.c_str(), pLines);
+}
+
+}  // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_READ_H */
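
For completeness, a short sketch of these public entry points (illustrative only, not part of this commit; the path is hypothetical):

    #include <cstdio>
    #include <string>
    #include <vector>
    #include <sourcetools/read/read.h>

    int main()
    {
      std::string contents;
      if (sourcetools::read("example.R", &contents))      // whole file as one string
        std::printf("read %lu bytes\n", (unsigned long) contents.size());

      std::vector<std::string> lines;
      if (sourcetools::read_lines("example.R", &lines))   // one element per line
        std::printf("read %lu lines\n", (unsigned long) lines.size());

      return 0;
    }
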
diff --git a/inst/include/sourcetools/read/windows/FileConnection.h b/inst/include/sourcetools/read/windows/FileConnection.h
new file mode 100644
index 0000000..de3c346
--- /dev/null
+++ b/inst/include/sourcetools/read/windows/FileConnection.h
@@ -0,0 +1,50 @@
+#ifndef SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H
+#define SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H
+
+#undef Realloc
+#undef Free
+#include <windows.h>
+
+namespace sourcetools {
+namespace detail {
+
+class FileConnection
+{
+public:
+  typedef HANDLE FileDescriptor;
+
+  FileConnection(const char* path, int flags = GENERIC_READ)
+  {
+    handle_ = ::CreateFile(path, flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
+  }
+
+  ~FileConnection()
+  {
+    if (open())
+      ::CloseHandle(handle_);
+  }
+
+  bool open()
+  {
+    return handle_ != INVALID_HANDLE_VALUE;
+  }
+
+  bool size(std::size_t* pSize)
+  {
+    *pSize = ::GetFileSize(handle_, NULL);
+    return true;
+  }
+
+  operator FileDescriptor() const
+  {
+    return handle_;
+  }
+
+private:
+  FileDescriptor handle_;
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H */
diff --git a/inst/include/sourcetools/read/windows/MemoryMappedConnection.h b/inst/include/sourcetools/read/windows/MemoryMappedConnection.h
new file mode 100644
index 0000000..0885e3b
--- /dev/null
+++ b/inst/include/sourcetools/read/windows/MemoryMappedConnection.h
@@ -0,0 +1,51 @@
+#ifndef SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H
+#define SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H
+
+#undef Realloc
+#undef Free
+#include <windows.h>
+
+namespace sourcetools {
+namespace detail {
+
+class MemoryMappedConnection
+{
+public:
+
+  MemoryMappedConnection(HANDLE handle, std::size_t size)
+    : map_(NULL), size_(size)
+  {
+    handle_ = ::CreateFileMapping(handle, NULL, PAGE_READONLY, 0, 0, NULL);
+    if (handle_ == NULL)
+      return;
+
+    map_ = (char*) ::MapViewOfFile(handle_, FILE_MAP_READ, 0, 0, size);
+  }
+
+  ~MemoryMappedConnection()
+  {
+    // Unmap the view before closing the mapping handle. Note that
+    // CreateFileMapping() returns NULL (not INVALID_HANDLE_VALUE) on failure.
+    if (map_ != NULL)
+      ::UnmapViewOfFile(map_);
+
+    if (handle_ != NULL)
+      ::CloseHandle(handle_);
+  }

+
+  bool open()
+  {
+    return map_ != NULL;
+  }
+
+  operator char*() const
+  {
+    return map_;
+  }
+
+private:
+  char* map_;
+  std::size_t size_;
+
+  HANDLE handle_;
+};
+
+} // namespace detail
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H */
diff --git a/inst/include/sourcetools/tests/testthat.h b/inst/include/sourcetools/tests/testthat.h
new file mode 100644
index 0000000..7ed58f5
--- /dev/null
+++ b/inst/include/sourcetools/tests/testthat.h
@@ -0,0 +1,14 @@
+#ifndef SOURCETOOLS_TESTS_TESTTHAT_H
+#define SOURCETOOLS_TESTS_TESTTHAT_H
+
+// disable testthat with older gcc
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && !defined(__clang__)
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#  define TESTTHAT_DISABLED
+# endif
+#endif
+
+// include testthat.h
+#include <testthat.h>
+
+#endif /* SOURCETOOLS_TESTS_TESTTHAT_H */
diff --git a/inst/include/sourcetools/tokenization/Registration.h b/inst/include/sourcetools/tokenization/Registration.h
new file mode 100644
index 0000000..9a44fee
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Registration.h
@@ -0,0 +1,190 @@
+#ifndef SOURCETOOLS_TOKENIZATION_REGISTRATION_H
+#define SOURCETOOLS_TOKENIZATION_REGISTRATION_H
+
+#include <string>
+#include <cstring>
+#include <cstdlib>
+
+namespace sourcetools {
+namespace tokens {
+
+typedef unsigned int TokenType;
+
+// Simple, non-nestable types.
+#define SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(__NAME__, __TYPE__)         \
+  static const TokenType __NAME__ = __TYPE__
+
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(INVALID,    (1 << 31));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(END,        (1 << 30));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(EMPTY,      (1 << 29));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(MISSING,    (1 << 28));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(ROOT,       (1 << 27));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SEMI,       (1 << 26));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMA,      (1 << 25));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SYMBOL,     (1 << 24));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMENT,    (1 << 23));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(WHITESPACE, (1 << 22));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(STRING,     (1 << 21));
+SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(NUMBER,     (1 << 20));
+
+/* Brackets */
+#define SOURCE_TOOLS_BRACKET_BIT        (1 << 19)
+#define SOURCE_TOOLS_BRACKET_RIGHT_BIT  (1 << 5)
+#define SOURCE_TOOLS_BRACKET_LEFT_BIT   (1 << 4)
+#define SOURCE_TOOLS_BRACKET_MASK       SOURCE_TOOLS_BRACKET_BIT
+#define SOURCE_TOOLS_BRACKET_LEFT_MASK  (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT)
+#define SOURCE_TOOLS_BRACKET_RIGHT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT)
+
+#define SOURCE_TOOLS_REGISTER_BRACKET(__NAME__, __SIDE__, __INDEX__)  \
+  static const TokenType __NAME__ =                            \
+    SOURCE_TOOLS_BRACKET_BIT | __SIDE__ | __INDEX__
+
+SOURCE_TOOLS_REGISTER_BRACKET(LPAREN,    SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 0));
+SOURCE_TOOLS_REGISTER_BRACKET(LBRACE,    SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 1));
+SOURCE_TOOLS_REGISTER_BRACKET(LBRACKET,  SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 2));
+SOURCE_TOOLS_REGISTER_BRACKET(LDBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 3));
+
+SOURCE_TOOLS_REGISTER_BRACKET(RPAREN,    SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 0));
+SOURCE_TOOLS_REGISTER_BRACKET(RBRACE,    SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 1));
+SOURCE_TOOLS_REGISTER_BRACKET(RBRACKET,  SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 2));
+SOURCE_TOOLS_REGISTER_BRACKET(RDBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 3));
+
+/* Operators */
+#define SOURCE_TOOLS_OPERATOR_BIT         (1 << 18)
+#define SOURCE_TOOLS_OPERATOR_UNARY_BIT   (1 << 6)
+#define SOURCE_TOOLS_OPERATOR_MASK        (SOURCE_TOOLS_OPERATOR_BIT)
+#define SOURCE_TOOLS_OPERATOR_UNARY_MASK  (SOURCE_TOOLS_OPERATOR_MASK | SOURCE_TOOLS_OPERATOR_UNARY_BIT)
+
+#define SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, __MASKS__) \
+                                                                        \
+  static const TokenType OPERATOR_ ## __NAME__ =                        \
+    SOURCE_TOOLS_OPERATOR_BIT | __MASKS__;                              \
+                                                                        \
+  static const char* const                                              \
+    OPERATOR_ ## __NAME__ ## _STRING = __STRING__
+
+#define SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(__NAME__, __STRING__, __INDEX__)    \
+  SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, SOURCE_TOOLS_OPERATOR_UNARY_BIT | __INDEX__)
+
+// See ?"Syntax" for details on R's operators.
+// Note: all registered operators work in a binary context, but only
+// some also work as unary operators (that is, occurring to the left
+// of their operand).
+//
+// In other words, -1 is parsed as `-`(1).
+//
+// Note that although brackets are operators we tokenize them separately,
+// since we need to later check for their paired complement.
+SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(PLUS,          "+",    0);
+SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(MINUS,         "-",    1);
+SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(HELP,          "?",    2);
+SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(NEGATION,      "!",    3);
+SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(FORMULA,       "~",    4);
+
+SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_EXPORTS,   "::",   5);
+SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_ALL,       ":::",  6);
+SOURCE_TOOLS_REGISTER_OPERATOR(DOLLAR,              "$",    7);
+SOURCE_TOOLS_REGISTER_OPERATOR(AT,                  "@",    8);
+SOURCE_TOOLS_REGISTER_OPERATOR(HAT,                 "^",    9);
+SOURCE_TOOLS_REGISTER_OPERATOR(EXPONENTATION_STARS, "**",  10);
+SOURCE_TOOLS_REGISTER_OPERATOR(SEQUENCE,            ":",   11);
+SOURCE_TOOLS_REGISTER_OPERATOR(MULTIPLY,            "*",   12);
+SOURCE_TOOLS_REGISTER_OPERATOR(DIVIDE,              "/",   13);
+SOURCE_TOOLS_REGISTER_OPERATOR(LESS,                "<",   14);
+SOURCE_TOOLS_REGISTER_OPERATOR(LESS_OR_EQUAL,       "<=",  15);
+SOURCE_TOOLS_REGISTER_OPERATOR(GREATER,             ">",   16);
+SOURCE_TOOLS_REGISTER_OPERATOR(GREATER_OR_EQUAL,    ">=",  17);
+SOURCE_TOOLS_REGISTER_OPERATOR(EQUAL,               "==",  18);
+SOURCE_TOOLS_REGISTER_OPERATOR(NOT_EQUAL,           "!=",  19);
+SOURCE_TOOLS_REGISTER_OPERATOR(AND_VECTOR,          "&",   20);
+SOURCE_TOOLS_REGISTER_OPERATOR(AND_SCALAR,          "&&",  21);
+SOURCE_TOOLS_REGISTER_OPERATOR(OR_VECTOR,           "|",   22);
+SOURCE_TOOLS_REGISTER_OPERATOR(OR_SCALAR,           "||",  23);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT,         "<-",  24);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_PARENT,  "<<-", 25);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT,        "->",  26);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT_PARENT, "->>", 27);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_EQUALS,  "=",   28);
+SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_COLON,   ":=",  29);
+SOURCE_TOOLS_REGISTER_OPERATOR(USER,                "%%",  30);
+
+/* Keywords and symbols */
+#define SOURCE_TOOLS_KEYWORD_BIT               (1 << 17)
+#define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT  (1 << 7)
+#define SOURCE_TOOLS_KEYWORD_MASK              SOURCE_TOOLS_KEYWORD_BIT
+#define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK (SOURCE_TOOLS_KEYWORD_MASK | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT)
+
+#define SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__)            \
+  static const TokenType KEYWORD_ ## __NAME__ =                \
+    __MASKS__ | SOURCE_TOOLS_KEYWORD_MASK
+
+#define SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(__NAME__, __MASKS__) \
+  SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__ | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK)
+
+// See '?Reserved' for a list of reserved R symbols.
+SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(IF,       1);
+SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FOR,      2);
+SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(WHILE,    3);
+SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(REPEAT,   4);
+SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FUNCTION, 5);
+
+SOURCE_TOOLS_REGISTER_KEYWORD(ELSE,                  6);
+SOURCE_TOOLS_REGISTER_KEYWORD(IN,                    7);
+SOURCE_TOOLS_REGISTER_KEYWORD(NEXT,                  8);
+SOURCE_TOOLS_REGISTER_KEYWORD(BREAK,                 9);
+SOURCE_TOOLS_REGISTER_KEYWORD(TRUE,                 10);
+SOURCE_TOOLS_REGISTER_KEYWORD(FALSE,                11);
+SOURCE_TOOLS_REGISTER_KEYWORD(NULL,                 12);
+SOURCE_TOOLS_REGISTER_KEYWORD(Inf,                  13);
+SOURCE_TOOLS_REGISTER_KEYWORD(NaN,                  14);
+SOURCE_TOOLS_REGISTER_KEYWORD(NA,                   15);
+SOURCE_TOOLS_REGISTER_KEYWORD(NA_integer_,          16);
+SOURCE_TOOLS_REGISTER_KEYWORD(NA_real_,             17);
+SOURCE_TOOLS_REGISTER_KEYWORD(NA_complex_,          18);
+SOURCE_TOOLS_REGISTER_KEYWORD(NA_character_,        19);
+
+inline TokenType symbolType(const char* string, std::size_t n)
+{
+  // TODO: Is this insanity really an optimization or am I just silly?
+  if (n < 2 || n > 13) {
+    return SYMBOL;
+  } else if (n == 2) {
+    if (!std::memcmp(string, "in", n)) return KEYWORD_IN;
+    if (!std::memcmp(string, "if", n)) return KEYWORD_IF;
+    if (!std::memcmp(string, "NA", n)) return KEYWORD_NA;
+  } else if (n == 3) {
+    if (!std::memcmp(string, "for", n)) return KEYWORD_FOR;
+    if (!std::memcmp(string, "Inf", n)) return KEYWORD_Inf;
+    if (!std::memcmp(string, "NaN", n)) return KEYWORD_NaN;
+  } else if (n == 4) {
+    if (!std::memcmp(string, "else", n)) return KEYWORD_ELSE;
+    if (!std::memcmp(string, "next", n)) return KEYWORD_NEXT;
+    if (!std::memcmp(string, "TRUE", n)) return KEYWORD_TRUE;
+    if (!std::memcmp(string, "NULL", n)) return KEYWORD_NULL;
+  } else if (n == 5) {
+    if (!std::memcmp(string, "while", n)) return KEYWORD_WHILE;
+    if (!std::memcmp(string, "break", n)) return KEYWORD_BREAK;
+    if (!std::memcmp(string, "FALSE", n)) return KEYWORD_FALSE;
+  } else if (n == 6) {
+    if (!std::memcmp(string, "repeat", n)) return KEYWORD_REPEAT;
+  } else if (n == 8) {
+    if (!std::memcmp(string, "function", n)) return KEYWORD_FUNCTION;
+    if (!std::memcmp(string, "NA_real_", n)) return KEYWORD_NA_real_;
+  } else if (n == 11) {
+    if (!std::memcmp(string, "NA_integer_", n)) return KEYWORD_NA_integer_;
+    if (!std::memcmp(string, "NA_complex_", n)) return KEYWORD_NA_complex_;
+  } else if (n == 13) {
+    if (!std::memcmp(string, "NA_character_", n)) return KEYWORD_NA_character_;
+  }
+
+  return SYMBOL;
+}
+
+inline TokenType symbolType(const std::string& symbol)
+{
+  return symbolType(symbol.data(), symbol.size());
+}
+
+} // namespace tokens
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_REGISTRATION_H */
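
In this scheme a token's broad class lives in a single high bit (bracket, operator, keyword, or one of the simple types), side and modifier flags in the middle bits, and an index in the low bits, so classification is a mask test and left/right brackets differ in one bit. A small sketch, illustrative only and not part of this commit:

    #include <cstdio>
    #include <sourcetools/tokenization/Registration.h>

    int main()
    {
      using namespace sourcetools::tokens;

      // LPAREN carries the bracket bit, the left-side bit, and an index bit.
      std::printf("LPAREN is a bracket:      %d\n",
                  (LPAREN & SOURCE_TOOLS_BRACKET_MASK) != 0);
      std::printf("LPAREN is a left bracket: %d\n",
                  (LPAREN & SOURCE_TOOLS_BRACKET_LEFT_MASK) == SOURCE_TOOLS_BRACKET_LEFT_MASK);

      // symbolType() dispatches on length first, mapping reserved words to
      // keyword types and everything else to SYMBOL.
      std::printf("'repeat' is a keyword: %d\n", symbolType("repeat", 6) == KEYWORD_REPEAT);
      std::printf("'foo' is a symbol:     %d\n", symbolType("foo", 3) == SYMBOL);

      return 0;
    }
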
diff --git a/inst/include/sourcetools/tokenization/Token.h b/inst/include/sourcetools/tokenization/Token.h
new file mode 100644
index 0000000..fff05f3
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Token.h
@@ -0,0 +1,522 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKEN_H
+#define SOURCETOOLS_TOKENIZATION_TOKEN_H
+
+#include <cstring>
+#include <cstdio>
+#include <cwchar>
+
+#include <vector>
+#include <string>
+#include <map>
+#include <sstream>
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/tokenization/Registration.h>
+#include <sourcetools/collection/Position.h>
+#include <sourcetools/cursor/TextCursor.h>
+
+namespace sourcetools {
+namespace tokens {
+
+class Token
+{
+private:
+  typedef cursors::TextCursor TextCursor;
+  typedef collections::Position Position;
+
+public:
+
+  Token()
+    : begin_(NULL),
+      end_(NULL),
+      offset_(0),
+      type_(INVALID)
+  {
+  }
+
+  explicit Token(TokenType type)
+    : begin_(NULL),
+      end_(NULL),
+      offset_(0),
+      type_(type)
+  {
+  }
+
+  Token(const Position& position)
+    : begin_(NULL),
+      end_(NULL),
+      offset_(0),
+      position_(position),
+      type_(INVALID)
+  {
+  }
+
+  Token(const TextCursor& cursor, TokenType type, std::size_t length)
+    : begin_(cursor.begin() + cursor.offset()),
+      end_(cursor.begin() + cursor.offset() + length),
+      offset_(cursor.offset()),
+      position_(cursor.position()),
+      type_(type)
+  {
+  }
+
+  const char* begin() const { return begin_; }
+  const char* end() const { return end_; }
+  std::size_t offset() const { return offset_; }
+  std::size_t size() const { return end_ - begin_; }
+
+  std::string contents() const
+  {
+    return std::string(begin_, end_);
+  }
+
+  bool contentsEqual(const char* string)
+  {
+    // Note: begin_ is not null-terminated, so compare over the token's extent.
+    std::size_t n = std::strlen(string);
+    return n == size() && std::memcmp(begin_, string, n) == 0;
+  }
+
+  bool contentsEqual(const std::string& string) const
+  {
+    if (string.size() != size())
+      return false;
+
+    return std::memcmp(begin_, string.c_str(), size()) == 0;
+  }
+
+  const Position& position() const { return position_; }
+  std::size_t row() const { return position_.row; }
+  std::size_t column() const { return position_.column; }
+
+  TokenType type() const { return type_; }
+  bool isType(TokenType type) const { return type_ == type; }
+
+private:
+  const char* begin_;
+  const char* end_;
+  std::size_t offset_;
+
+  Position position_;
+  TokenType type_;
+};
+
+inline bool isBracket(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_MASK);
+}
+
+inline bool isLeftBracket(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_LEFT_MASK);
+}
+
+inline bool isRightBracket(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_RIGHT_MASK);
+}
+
+inline bool isComplement(TokenType lhs, TokenType rhs)
+{
+  static const TokenType mask =
+    SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT;
+
+  if (SOURCE_TOOLS_CHECK_MASK((lhs | rhs), mask))
+    return SOURCE_TOOLS_LOWER_BITS(lhs, 4) == SOURCE_TOOLS_LOWER_BITS(rhs, 4);
+
+  return false;
+}
+
+inline TokenType complement(TokenType type)
+{
+  static const TokenType mask =
+    SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT;
+
+  return type ^ mask;
+}
+
+inline bool isKeyword(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_MASK);
+}
+
+inline bool isControlFlowKeyword(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK);
+}
+
+inline bool isOperator(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_MASK);
+}
+
+inline bool isUnaryOperator(const Token& token)
+{
+  return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_UNARY_MASK);
+}
+
+inline bool isNonUnaryOperator(const Token& token)
+{
+  return isOperator(token) && !isUnaryOperator(token);
+}
+
+inline bool isComparisonOperator(const Token& token)
+{
+  switch (token.type())
+  {
+  case OPERATOR_AND_SCALAR:
+  case OPERATOR_AND_VECTOR:
+  case OPERATOR_OR_SCALAR:
+  case OPERATOR_OR_VECTOR:
+  case OPERATOR_EQUAL:
+  case OPERATOR_NOT_EQUAL:
+  case OPERATOR_LESS:
+  case OPERATOR_LESS_OR_EQUAL:
+  case OPERATOR_GREATER:
+  case OPERATOR_GREATER_OR_EQUAL:
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline bool isWhitespace(const Token& token)
+{
+  return token.type() == WHITESPACE;
+}
+
+inline bool isComment(const Token& token)
+{
+  return token.type() == COMMENT;
+}
+
+inline bool isSymbol(const Token& token)
+{
+  return token.type() == SYMBOL;
+}
+
+inline bool isEnd(const Token& token)
+{
+  return token.type() == END;
+}
+
+inline bool isString(const Token& token)
+{
+  return token.type() == STRING;
+}
+
+inline bool isSymbolic(const Token& token)
+{
+  static const TokenType mask = SYMBOL | NUMBER | STRING;
+  return (token.type() & mask) != 0;
+}
+
+inline bool isNumeric(const Token& token)
+{
+  return (token.type() & NUMBER) != 0;
+}
+
+inline bool isCallOperator(const Token& token)
+{
+  return token.type() == LPAREN ||
+         token.type() == LBRACKET ||
+         token.type() == LDBRACKET;
+}
+
+inline bool isAssignmentOperator(const Token& token)
+{
+  switch (token.type())
+  {
+  case OPERATOR_ASSIGN_LEFT:
+  case OPERATOR_ASSIGN_LEFT_COLON:
+  case OPERATOR_ASSIGN_LEFT_EQUALS:
+  case OPERATOR_ASSIGN_LEFT_PARENT:
+  case OPERATOR_ASSIGN_RIGHT:
+  case OPERATOR_ASSIGN_RIGHT_PARENT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+namespace detail {
+
+inline bool isHexDigit(char c)
+{
+  if (c >= '0' && c <= '9')
+    return true;
+  else if (c >= 'a' && c <= 'f')
+    return true;
+  else if (c >= 'A' && c <= 'F')
+    return true;
+  return false;
+}
+
+inline int hexValue(char c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  else if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  else if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+
+  return 0;
+}
+
+// Parses an octal escape sequence, e.g. '\012'.
+inline bool parseOctal(const char*& it, char*& output)
+{
+  // Check for opening escape
+  if (*it != '\\')
+    return false;
+
+  // Check for number following
+  char lookahead = *(it + 1);
+  if (lookahead < '0' || lookahead > '7')
+    return false;
+  ++it;
+
+  // Begin parsing. Consume up to three numbers.
+  unsigned char result = 0;
+  const char* end = it + 3;
+  for (; it != end; ++it)
+  {
+    char ch = *it;
+    if ('0' <= ch && ch <= '7')
+      result = 8 * result + ch - '0';
+    else
+      break;
+  }
+
+  // Assign result, and return.
+  *output++ = result;
+  return true;
+}
+
+// Parse a hex escape sequence, e.g. '\xFF'.
+inline bool parseHex(const char*& it, char*& output)
+{
+  // Check for opening escape.
+  if (*it != '\\')
+    return false;
+
+  if (*(it + 1) != 'x')
+    return false;
+
+  if (!isHexDigit(*(it + 2)))
+    return false;
+
+  // Begin parsing.
+  it += 2;
+  unsigned char value = 0;
+  const char* end = it + 2;
+  for (; it != end; ++it)
+  {
+    // Stop at the first non-hex character; a literal '0' digit is still valid.
+    if (!isHexDigit(*it))
+      break;
+    value = 16 * value + hexValue(*it);
+  }
+
+  *output++ = value;
+  return true;
+}
+
+// Parse a unicode escape sequence.
+inline bool parseUnicode(const char*& it, char*& output)
+{
+  if (*it != '\\')
+    return false;
+
+  char lookahead = *(it + 1);
+  int size;
+  if (lookahead == 'u')
+    size = 4;
+  else if (lookahead == 'U')
+    size = 8;
+  else
+    return false;
+
+  // Clone the input iterator (only set it on success)
+  const char* clone = it;
+  clone += 2;
+
+  // Check for e.g. '\u{...}'
+  //                   ^
+  bool delimited = *clone == '{';
+  clone += delimited;
+
+  // Check for a hex digit.
+  if (!isHexDigit(*clone))
+    return false;
+
+  // Begin parsing hex digits
+  wchar_t value = 0;
+  const char* end = clone + size;
+  for (; clone != end; ++clone)
+  {
+    if (!isHexDigit(*clone))
+      break;
+
+    int hex = hexValue(*clone);
+    value = 16 * value + hex;
+  }
+
+  // Eat a closing '}' if we had a starting '{'.
+  if (delimited)
+  {
+    if (*clone != '}')
+      return false;
+    ++clone;
+  }
+
+  std::mbstate_t state;
+  std::memset(&state, 0, sizeof(state));
+  std::size_t bytes = std::wcrtomb(output, value, &state);
+  if (bytes == static_cast<std::size_t>(-1))
+    return false;
+
+  // Update iterator state
+  it = clone;
+  output += bytes;
+  return true;
+}
+
+} // namespace detail
+
+inline std::string stringValue(const char* begin, const char* end)
+{
+  if (begin == end)
+    return std::string();
+
+  std::size_t n = end - begin;
+  scoped_array<char> buffer(new char[n + 1]);
+
+  const char* it = begin;
+  char* output = buffer;
+
+  while (it < end)
+  {
+    if (*it == '\\')
+    {
+      if (detail::parseOctal(it, output) ||
+          detail::parseHex(it, output) ||
+          detail::parseUnicode(it, output))
+      {
+        continue;
+      }
+
+      // Handle the rest
+      ++it;
+      switch (*it)
+      {
+      case 'a':  *output++ = '\a'; break;
+      case 'b':  *output++ = '\b'; break;
+      case 'f':  *output++ = '\f'; break;
+      case 'n':  *output++ = '\n'; break;
+      case 'r':  *output++ = '\r'; break;
+      case 't':  *output++ = '\t'; break;
+      case 'v':  *output++ = '\v'; break;
+      case '\\': *output++ = '\\'; break;
+      default:   *output++ = *it;  break;
+      }
+      ++it;
+    }
+    else
+    {
+      *output++ = *it++;
+    }
+  }
+
+  // Null-terminate the buffer, but don't include the terminator in the result
+  *output = '\0';
+
+  // Construct the result string and return
+  std::string result(buffer, output - buffer);
+  return result;
+}
+
+inline std::string stringValue(const Token& token)
+{
+  switch (token.type())
+  {
+  case STRING:
+    return stringValue(token.begin() + 1, token.end() - 1);
+  case SYMBOL:
+    if (*token.begin() == '`')
+      return stringValue(token.begin() + 1, token.end() - 1);
+  default:
+    return stringValue(token.begin(), token.end());
+  }
+}
+
+} // namespace tokens
+
+inline std::string toString(tokens::TokenType type)
+{
+  using namespace tokens;
+
+       if (type == INVALID)    return "invalid";
+  else if (type == END)        return "end";
+  else if (type == EMPTY)      return "empty";
+  else if (type == MISSING)    return "missing";
+  else if (type == SEMI)       return "semi";
+  else if (type == COMMA)      return "comma";
+  else if (type == SYMBOL)     return "symbol";
+  else if (type == COMMENT)    return "comment";
+  else if (type == WHITESPACE) return "whitespace";
+  else if (type == STRING)     return "string";
+  else if (type == NUMBER)     return "number";
+
+  else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_BRACKET_MASK))
+    return "bracket";
+  else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_KEYWORD_MASK))
+    return "keyword";
+  else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_OPERATOR_MASK))
+    return "operator";
+
+  return "unknown";
+}
+
+inline std::string toString(const tokens::Token& token)
+{
+  std::string contents;
+  if (token.isType(tokens::END))
+    contents = "<END>";
+  else if (token.isType(tokens::EMPTY))
+    contents = "<empty>";
+  else if (token.isType(tokens::MISSING))
+    contents = "<missing>";
+  else
+    contents = token.contents();
+
+  static const int N = 1024;
+  if (contents.size() > N / 2)
+    contents = contents.substr(0, N / 2);
+  char buff[N];
+  std::sprintf(buff,
+               "[%4lu:%4lu]: %s",
+               static_cast<unsigned long>(token.row()),
+               static_cast<unsigned long>(token.column()),
+               contents.c_str());
+  return buff;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const tokens::Token& token)
+{
+  return os << toString(token);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<tokens::Token>& tokens)
+{
+  for (std::vector<tokens::Token>::const_iterator it = tokens.begin();
+       it != tokens.end();
+       ++it)
+  {
+    os << *it << std::endl;
+  }
+
+  return os;
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKEN_H */
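
A small sketch of the escape handling and bracket pairing defined above (illustrative only, not part of this commit):

    #include <cstdio>
    #include <string>
    #include <sourcetools/tokenization/Token.h>

    int main()
    {
      using namespace sourcetools::tokens;

      // stringValue() resolves octal, hex, and named escape sequences; here
      // "\n", "\x41", and "\101" become a newline and two 'A' characters.
      const std::string escaped = "a\\nb\\x41\\101";
      std::string value = stringValue(escaped.data(), escaped.data() + escaped.size());
      std::printf("unescaped to %lu bytes\n", (unsigned long) value.size());

      // Left and right brackets differ only in their side bit, so
      // complement() flips that bit to find the matching delimiter.
      std::printf("complement(LPAREN) == RPAREN: %d\n", complement(LPAREN) == RPAREN);
      std::printf("isComplement(LDBRACKET, RDBRACKET): %d\n",
                  isComplement(LDBRACKET, RDBRACKET));

      return 0;
    }
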
diff --git a/inst/include/sourcetools/tokenization/Tokenizer.h b/inst/include/sourcetools/tokenization/Tokenizer.h
new file mode 100644
index 0000000..3f601dd
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/Tokenizer.h
@@ -0,0 +1,463 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKENIZER_H
+#define SOURCETOOLS_TOKENIZATION_TOKENIZER_H
+
+#include <sourcetools/core/core.h>
+#include <sourcetools/tokenization/Token.h>
+#include <sourcetools/cursor/TextCursor.h>
+
+#include <vector>
+#include <stack>
+#include <sstream>
+
+namespace sourcetools {
+namespace tokenizer {
+
+class Tokenizer
+{
+private:
+  typedef tokens::Token Token;
+  typedef cursors::TextCursor TextCursor;
+  typedef tokens::TokenType TokenType;
+
+private:
+
+  // Tokenization ----
+
+  void consumeToken(TokenType type,
+                    std::size_t length,
+                    Token* pToken)
+  {
+    *pToken = Token(cursor_, type, length);
+    cursor_.advance(length);
+  }
+
+  template <bool SkipEscaped, bool InvalidOnError>
+  void consumeUntil(char ch,
+                    TokenType type,
+                    Token* pToken)
+  {
+    TextCursor lookahead = cursor_;
+
+    bool success = false;
+    std::size_t distance = 0;
+
+    while (lookahead != lookahead.end()) {
+      lookahead.advance();
+      ++distance;
+
+      if (SkipEscaped && lookahead.peek() == '\\') {
+        lookahead.advance();
+        ++distance;
+        continue;
+      }
+
+      if (lookahead.peek() == ch) {
+        success = true;
+        break;
+      }
+    }
+
+    if (success) {
+      consumeToken(type, distance + 1, pToken);
+    } else {
+      consumeToken(
+        InvalidOnError ? tokens::INVALID : type,
+        distance,
+        pToken
+      );
+    }
+  }
+
+  void consumeUserOperator(Token* pToken)
+  {
+    consumeUntil<false, true>('%', tokens::OPERATOR_USER, pToken);
+  }
+
+  void consumeComment(Token* pToken)
+  {
+    consumeUntil<false, false>('\n', tokens::COMMENT, pToken);
+  }
+
+  void consumeQuotedSymbol(Token* pToken)
+  {
+    consumeUntil<true, true>('`', tokens::SYMBOL, pToken);
+  }
+
+  void consumeQString(Token* pToken)
+  {
+    consumeUntil<true, true>('\'', tokens::STRING, pToken);
+  }
+
+  void consumeQQString(Token* pToken)
+  {
+    consumeUntil<true, true>('"', tokens::STRING, pToken);
+  }
+
+  // NOTE: Don't tokenize '-' or '+' as part of a number; instead,
+  // they are parsed as unary operators.
+  bool isStartOfNumber()
+  {
+    char ch = cursor_.peek();
+    if (utils::isDigit(ch))
+      return true;
+    if (ch == '.')
+      return utils::isDigit(cursor_.peek(1));
+    return false;
+  }
+
+  bool isStartOfSymbol()
+  {
+    return utils::isValidForStartOfRSymbol(cursor_.peek());
+  }
+
+  bool consumeHexadecimalNumber(Token* pToken)
+  {
+    std::size_t distance = 0;
+
+    // Detect the leading '0'.
+    if (cursor_.peek(distance) != '0')
+      return false;
+    ++distance;
+
+    // Detect a 'x' or 'X'.
+    if (!(cursor_.peek(distance) == 'x' || cursor_.peek(distance) == 'X'))
+      return false;
+    ++distance;
+
+    // Check and consume all alphanumeric characters.
+    // The number is valid if the characters are valid
+    // hexadecimal characters (0-9, a-f, A-F). The number
+    // can also end with an 'i' (for an imaginary number)
+    // or with an 'L' for an integer.
+    if (!utils::isHexDigit(cursor_.peek(distance)))
+    {
+      consumeToken(tokens::INVALID, distance, pToken);
+      return false;
+    }
+
+    bool success = true;
+    char peek = cursor_.peek(distance);
+    while (utils::isAlphaNumeric(peek) && peek != '\0') {
+
+      // If we encounter an 'i' or an 'L', assume
+      // that this ends the identifier.
+      if (peek == 'i' || peek == 'L')
+      {
+        ++distance;
+        break;
+      }
+
+      if (!utils::isHexDigit(peek))
+        success = false;
+
+      ++distance;
+      peek = cursor_.peek(distance);
+    }
+
+    consumeToken(success ? tokens::NUMBER : tokens::INVALID, distance, pToken);
+    return true;
+  }
+
+  void consumeNumber(Token* pToken)
+  {
+    bool success = true;
+    std::size_t distance = 0;
+
+    // NOTE: A leading '-' or '+' is not consumed as part of
+    // the number.
+
+    // Try parsing this as a hexadecimal number first (e.g. '0xabc').
+    if (consumeHexadecimalNumber(pToken))
+      return;
+
+    // Consume digits
+    while (utils::isDigit(cursor_.peek(distance)))
+      ++distance;
+
+    // Consume a dot for decimals
+    // Note: '.5' is a valid specification for a number
+    // So is '100.'; ie, with a trailing decimal.
+    if (cursor_.peek(distance) == '.') {
+      ++distance;
+      while (utils::isDigit(cursor_.peek(distance)))
+        ++distance;
+    }
+
+    // Consume 'e', 'E' for exponential notation
+    if (cursor_.peek(distance) == 'e' || cursor_.peek(distance) == 'E') {
+      ++distance;
+
+      // Consume a '-' or a '+' for a negative number
+      if (cursor_.peek(distance) == '-' || cursor_.peek(distance) == '+')
+        ++distance;
+
+      // Parse another set of numbers following the E
+      success = utils::isDigit(cursor_.peek(distance));
+      while (utils::isDigit(cursor_.peek(distance)))
+        ++distance;
+
+      // Consume '.' and following numbers. Note that this is
+      // not really a valid number for R but it's better to tokenize
+      // this as a single entity (and then report failure later)
+      if (cursor_.peek(distance) == '.') {
+        success = false;
+        ++distance;
+        while (utils::isDigit(cursor_.peek(distance)))
+          ++distance;
+      }
+    }
+
+    // Consume a final 'L' for integer literals
+    if (cursor_.peek(distance) == 'L')
+      ++distance;
+
+    consumeToken(success ? tokens::NUMBER : tokens::INVALID, distance, pToken);
+  }
+
+  void consumeSymbol(Token* pToken)
+  {
+    std::size_t distance = 1;
+    char ch = cursor_.peek(distance);
+    while (utils::isValidForRSymbol(ch)) {
+      ++distance;
+      ch = cursor_.peek(distance);
+    }
+
+    const char* ptr = &*(cursor_.begin() + cursor_.offset());
+    consumeToken(tokens::symbolType(ptr, distance), distance, pToken);
+  }
+
+public:
+
+  Tokenizer(const char* code, std::size_t n)
+    : cursor_(code, n)
+  {
+  }
+
+  bool tokenize(Token* pToken)
+  {
+    if (cursor_ >= cursor_.end())
+    {
+      *pToken = Token(tokens::END);
+      return false;
+    }
+
+    char ch = cursor_.peek();
+    int n = 0;
+
+    // Block-related tokens
+    if (ch == '{')
+      consumeToken(tokens::LBRACE, 1, pToken);
+    else if (ch == '}')
+      consumeToken(tokens::RBRACE, 1, pToken);
+    else if (ch == '(')
+      consumeToken(tokens::LPAREN, 1, pToken);
+    else if (ch == ')')
+      consumeToken(tokens::RPAREN, 1, pToken);
+    else if (ch == '[') {
+      if (cursor_.peek(1) == '[') {
+        tokenStack_.push(tokens::LDBRACKET);
+        consumeToken(tokens::LDBRACKET, 2, pToken);
+      } else {
+        tokenStack_.push(tokens::LBRACKET);
+        consumeToken(tokens::LBRACKET, 1, pToken);
+      }
+    } else if (ch == ']') {
+      if (tokenStack_.empty()) {
+        consumeToken(tokens::INVALID, 1, pToken);
+      } else if (tokenStack_.top() == tokens::LDBRACKET) {
+        tokenStack_.pop();
+        if (cursor_.peek(1) == ']')
+          consumeToken(tokens::RDBRACKET, 2, pToken);
+        else
+          consumeToken(tokens::INVALID, 1, pToken);
+      } else {
+        tokenStack_.pop();
+        consumeToken(tokens::RBRACKET, 1, pToken);
+      }
+    }
+
+    // Operators
+    else if (ch == '<')  // <<-, <=, <-, <
+    {
+      char next = cursor_.peek(1);
+      if (next == '-') // <-
+        consumeToken(tokens::OPERATOR_ASSIGN_LEFT, 2, pToken);
+      else if (next == '=') // <=
+        consumeToken(tokens::OPERATOR_LESS_OR_EQUAL, 2, pToken);
+      else if (next == '<' && cursor_.peek(2) == '-')
+        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_PARENT, 3, pToken);
+      else
+        consumeToken(tokens::OPERATOR_LESS, 1, pToken);
+    }
+
+    else if (ch == '>')  // >=, >
+    {
+      if (cursor_.peek(1) == '=')
+        consumeToken(tokens::OPERATOR_GREATER_OR_EQUAL, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_GREATER, 1, pToken);
+    }
+    else if (ch == '=')  // '==', '='
+    {
+      if (cursor_.peek(1) == '=')
+        consumeToken(tokens::OPERATOR_EQUAL, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_EQUALS, 1, pToken);
+    }
+    else if (ch == '|')  // '||', '|'
+    {
+      if (cursor_.peek(1) == '|')
+        consumeToken(tokens::OPERATOR_OR_SCALAR, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_OR_VECTOR, 1, pToken);
+    }
+    else if (ch == '&')  // '&&', '&'
+    {
+      if (cursor_.peek(1) == '&')
+        consumeToken(tokens::OPERATOR_AND_SCALAR, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_AND_VECTOR, 1, pToken);
+    }
+    else if (ch == '*')  // **, *
+    {
+      if (cursor_.peek(1) == '*')
+        consumeToken(tokens::OPERATOR_EXPONENTATION_STARS, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_MULTIPLY, 1, pToken);
+    }
+    else if (ch == ':')  // ':::', '::', ':=', ':'
+    {
+      if (cursor_.peek(1) == ':')
+      {
+        if (cursor_.peek(2) == ':')
+          consumeToken(tokens::OPERATOR_NAMESPACE_ALL, 3, pToken);
+        else
+          consumeToken(tokens::OPERATOR_NAMESPACE_EXPORTS, 2, pToken);
+      }
+      else if (cursor_.peek(1) == '=')
+        consumeToken(tokens::OPERATOR_ASSIGN_LEFT_COLON, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_SEQUENCE, 1, pToken);
+    }
+    else if (ch == '!')
+    {
+      if (cursor_.peek(1) == '=')
+        consumeToken(tokens::OPERATOR_NOT_EQUAL, 2, pToken);
+      else
+        consumeToken(tokens::OPERATOR_NEGATION, 1, pToken);
+    }
+    else if (ch == '-') // '->>', '->', '-'
+    {
+      if (cursor_.peek(1) == '>')
+      {
+        if (cursor_.peek(2) == '>')
+          consumeToken(tokens::OPERATOR_ASSIGN_RIGHT_PARENT, 3, pToken);
+        else
+          consumeToken(tokens::OPERATOR_ASSIGN_RIGHT, 2, pToken);
+      }
+      else
+        consumeToken(tokens::OPERATOR_MINUS, 1, pToken);
+    }
+    else if (ch == '+')
+      consumeToken(tokens::OPERATOR_PLUS, 1, pToken);
+    else if (ch == '~')
+      consumeToken(tokens::OPERATOR_FORMULA, 1, pToken);
+    else if (ch == '?')
+      consumeToken(tokens::OPERATOR_HELP, 1, pToken);
+    else if (ch == '/')
+      consumeToken(tokens::OPERATOR_DIVIDE, 1, pToken);
+    else if (ch == '@')
+      consumeToken(tokens::OPERATOR_AT, 1, pToken);
+    else if (ch == '$')
+      consumeToken(tokens::OPERATOR_DOLLAR, 1, pToken);
+    else if (ch == '^')
+      consumeToken(tokens::OPERATOR_HAT, 1, pToken);
+
+    // User operators
+    else if (ch == '%')
+      consumeUserOperator(pToken);
+
+    // Punctuation-related tokens
+    else if (ch == ',')
+      consumeToken(tokens::COMMA, 1, pToken);
+    else if (ch == ';')
+      consumeToken(tokens::SEMI, 1, pToken);
+
+    // Whitespace
+    else if (utils::countWhitespaceBytes(cursor_, &n))
+      consumeToken(tokens::WHITESPACE, n, pToken);
+
+    // Strings and symbols
+    else if (ch == '\'')
+      consumeQString(pToken);
+    else if (ch == '"')
+      consumeQQString(pToken);
+    else if (ch == '`')
+      consumeQuotedSymbol(pToken);
+
+    // Comments
+    else if (ch == '#')
+      consumeComment(pToken);
+
+    // Number
+    else if (isStartOfNumber())
+      consumeNumber(pToken);
+
+    // Symbol
+    else if (isStartOfSymbol())
+      consumeSymbol(pToken);
+
+    // Nothing matched -- error
+    else
+      consumeToken(tokens::INVALID, 1, pToken);
+
+    return true;
+  }
+
+  Token peek(std::size_t lookahead = 1)
+  {
+    Tokenizer clone(*this);
+
+    Token result(tokens::END);
+    for (std::size_t i = 0; i < lookahead; ++i) {
+      if (!clone.tokenize(&result)) {
+        break;
+      }
+    }
+
+    return result;
+  }
+
+private:
+  TextCursor cursor_;
+  std::stack<TokenType, std::vector<TokenType> > tokenStack_;
+};
+
+} // namespace tokenizer
+
+inline std::vector<tokens::Token> tokenize(const char* code, std::size_t n)
+{
+  typedef tokenizer::Tokenizer Tokenizer;
+  typedef tokens::Token Token;
+
+  std::vector<Token> tokens;
+  if (n == 0)
+    return tokens;
+
+  Token token;
+  Tokenizer tokenizer(code, n);
+  while (tokenizer.tokenize(&token))
+    tokens.push_back(token);
+
+  return tokens;
+}
+
+inline std::vector<tokens::Token> tokenize(const std::string& code)
+{
+  return tokenize(code.data(), code.size());
+}
+
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKENIZER_H */
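
A short sketch of the tokenize() entry point, illustrative only and not part of this commit. It shows the behaviour noted in the comments above: a leading '-' becomes its own unary-operator token, and '.5' is accepted as a number.

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>
    #include <sourcetools/tokenization/Tokenizer.h>

    int main()
    {
      std::vector<sourcetools::tokens::Token> tokens =
          sourcetools::tokenize(std::string("x <- -.5"));

      // toString() formats each token as "[row:col]: contents".
      for (std::size_t i = 0; i < tokens.size(); ++i)
        std::cout << sourcetools::toString(tokens[i]) << std::endl;

      return 0;
    }
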
diff --git a/inst/include/sourcetools/tokenization/tokenization.h b/inst/include/sourcetools/tokenization/tokenization.h
new file mode 100644
index 0000000..e1dee85
--- /dev/null
+++ b/inst/include/sourcetools/tokenization/tokenization.h
@@ -0,0 +1,8 @@
+#ifndef SOURCETOOLS_TOKENIZATION_TOKENIZATION_H
+#define SOURCETOOLS_TOKENIZATION_TOKENIZATION_H
+
+#include <sourcetools/tokenization/Registration.h>
+#include <sourcetools/tokenization/Token.h>
+#include <sourcetools/tokenization/Tokenizer.h>
+
+#endif /* SOURCETOOLS_TOKENIZATION_TOKENIZATION_H */
diff --git a/inst/include/sourcetools/utf8/utf8.h b/inst/include/sourcetools/utf8/utf8.h
new file mode 100644
index 0000000..96e0c5e
--- /dev/null
+++ b/inst/include/sourcetools/utf8/utf8.h
@@ -0,0 +1,115 @@
+#ifndef SOURCETOOLS_UTF8_UTF8_H
+#define SOURCETOOLS_UTF8_UTF8_H
+
+#include <cstddef>
+
+#include <sourcetools/core/core.h>
+
+namespace sourcetools {
+namespace utf8 {
+
+namespace detail {
+static const unsigned char mask[] = {
+  0,    // 00000000
+  0x7F, // 01111111
+  0x1F, // 00011111
+  0x0F, // 00001111
+  0x07, // 00000111
+  0x03, // 00000011
+  0x01  // 00000001
+};
+} // namespace detail
+
+class iterator
+{
+public:
+  iterator(const char* data)
+    : data_(reinterpret_cast<const unsigned char*>(data)),
+      offset_(0)
+  {
+  }
+
+  iterator(const iterator& other)
+    : data_(other.data_),
+      offset_(other.offset_)
+  {
+  }
+
+  wchar_t operator*()
+  {
+    std::size_t n = size();
+    if (n == 0 || n > 6)
+      return -1;
+
+    const unsigned char* it = data_ + offset_;
+    wchar_t ch = (*it++) & detail::mask[n];
+    for (std::size_t i = 1; i < n; ++i)
+    {
+      ch <<= 6;
+      ch |= (*it++) & 0x3F;
+    }
+
+    return ch;
+  }
+
+  iterator& operator++()
+  {
+    offset_ += size();
+    return *this;
+  }
+
+  iterator operator++(int)
+  {
+    iterator copy(*this);
+    operator++();
+    return copy;
+  }
+
+  bool operator==(const iterator& it)
+  {
+    return
+      data_ + offset_ ==
+      it.data_ + it.offset_;
+  }
+
+  bool operator!=(const iterator& it)
+  {
+    return
+      data_ + offset_ !=
+      it.data_ + it.offset_;
+  }
+
+private:
+
+  int size()
+  {
+    unsigned char ch = data_[offset_];
+    if (ch == 0)
+      return 0;
+    else if (ch < 192)
+      return 1;
+    else if (ch < 224)
+      return 2;
+    else if (ch < 240)
+      return 3;
+    else if (ch < 248)
+      return 4;
+    else if (ch < 252)
+      return 5;
+    else if (ch < 254)
+      return 6;
+
+    // TODO: on error?
+    return 1;
+  }
+
+private:
+
+  const unsigned char* data_;
+  std::size_t offset_;
+};
+
+} // namespace utf8
+} // namespace sourcetools
+
+#endif /* SOURCETOOLS_UTF8_UTF8_H */
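
A small sketch of the UTF-8 iterator above (illustrative only, not part of this commit): the lead byte selects the sequence length and mask, and each increment steps over one code point.

    #include <cstdio>
    #include <cstring>
    #include <sourcetools/utf8/utf8.h>

    int main()
    {
      const char* text = "a\xE2\x99\xA5b";  // 'a', U+2665, 'b'

      sourcetools::utf8::iterator it(text);
      sourcetools::utf8::iterator end(text + std::strlen(text));

      // Prints U+0061, U+2665, U+0062.
      for (; it != end; ++it)
        std::printf("U+%04lX\n", (unsigned long) *it);

      return 0;
    }
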
diff --git a/man/read.Rd b/man/read.Rd
new file mode 100644
index 0000000..a3223f3
--- /dev/null
+++ b/man/read.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sourcetools.R
+\name{read}
+\alias{read}
+\alias{read_bytes}
+\alias{read_lines}
+\alias{read_lines_bytes}
+\title{Read the Contents of a File}
+\usage{
+read(path)
+
+read_lines(path)
+
+read_bytes(path)
+
+read_lines_bytes(path)
+}
+\arguments{
+\item{path}{A file path.}
+}
+\description{
+Read the contents of a file into a string (or, in the case of
+\code{read_lines}, a vector of strings).
+}
+
diff --git a/man/tokenize-methods.Rd b/man/tokenize-methods.Rd
new file mode 100644
index 0000000..4f1da94
--- /dev/null
+++ b/man/tokenize-methods.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sourcetools.R
+\name{tokenize_file}
+\alias{tokenize}
+\alias{tokenize_file}
+\alias{tokenize_string}
+\title{Tokenize R Code}
+\usage{
+tokenize_file(path)
+
+tokenize_string(string)
+
+tokenize(file = "", text = NULL)
+}
+\arguments{
+\item{file, path}{A file path.}
+
+\item{text, string}{\R code as a character vector of length one.}
+}
+\value{
+A \code{data.frame} with the following columns:
+
+\tabular{ll}{
+\code{value}  \tab The token's contents, as a string.     \cr
+\code{row}    \tab The row where the token is located.    \cr
+\code{column} \tab The column where the token is located. \cr
+\code{type}   \tab The token type, as a string.           \cr
+}
+}
+\description{
+Tools for tokenizing \R code.
+}
+\note{
+Line numbers are determined by the presence of the \code{\\n}
+line feed character, under the assumption that code being tokenized
+will use either \code{\\n} to indicate newlines (as on modern
+Unix systems), or \code{\\r\\n} as on Windows.
+}
+\examples{
+tokenize_string("x <- 1 + 2")
+}
+
diff --git a/src/Makevars b/src/Makevars
new file mode 100644
index 0000000..4340efb
--- /dev/null
+++ b/src/Makevars
@@ -0,0 +1 @@
+PKG_CPPFLAGS = -I../inst/include
diff --git a/src/Makevars.win b/src/Makevars.win
new file mode 100644
index 0000000..4340efb
--- /dev/null
+++ b/src/Makevars.win
@@ -0,0 +1 @@
+PKG_CPPFLAGS = -I../inst/include
diff --git a/src/Reader.cpp b/src/Reader.cpp
new file mode 100644
index 0000000..331ba0c
--- /dev/null
+++ b/src/Reader.cpp
@@ -0,0 +1,88 @@
+#include <cstring>
+
+#include <sourcetools/read/read.h>
+#include <sourcetools/r/r.h>
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+extern "C" SEXP sourcetools_read(SEXP absolutePathSEXP)
+{
+  const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::string contents;
+  bool result = sourcetools::read(absolutePath, &contents);
+  if (!result)
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  sourcetools::r::Protect protect;
+  SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1));
+  SET_STRING_ELT(resultSEXP, 0, Rf_mkCharLen(contents.c_str(), contents.size()));
+  return resultSEXP;
+}
+
+extern "C" SEXP sourcetools_read_lines(SEXP absolutePathSEXP)
+{
+  const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::vector<std::string> lines;
+  bool result = sourcetools::read_lines(absolutePath, &lines);
+  if (!result)
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  std::size_t n = lines.size();
+  sourcetools::r::Protect protect;
+  SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n));
+  for (std::size_t i = 0; i < n; ++i)
+    SET_STRING_ELT(resultSEXP, i, Rf_mkCharLen(lines[i].c_str(), lines[i].size()));
+  return resultSEXP;
+}
+
+extern "C" SEXP sourcetools_read_bytes(SEXP absolutePathSEXP)
+{
+  const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::string contents;
+  bool result = sourcetools::read(absolutePath, &contents);
+  if (!result)
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  sourcetools::r::Protect protect;
+  SEXP resultSEXP = protect(Rf_allocVector(RAWSXP, contents.size()));
+  std::memcpy(RAW(resultSEXP), contents.c_str(), contents.size());
+  return resultSEXP;
+}
+
+extern "C" SEXP sourcetools_read_lines_bytes(SEXP absolutePathSEXP)
+{
+  const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+
+  std::vector<std::string> lines;
+  bool result = sourcetools::read_lines(absolutePath, &lines);
+  if (!result)
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  std::size_t n = lines.size();
+  sourcetools::r::Protect protect;
+  SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n));
+  for (std::size_t i = 0; i < n; ++i)
+  {
+    SEXP rawSEXP = Rf_allocVector(RAWSXP, lines[i].size());
+    std::memcpy(RAW(rawSEXP), lines[i].c_str(), lines[i].size());
+    SET_VECTOR_ELT(resultSEXP, i, rawSEXP);
+  }
+  return resultSEXP;
+}
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
new file mode 100644
index 0000000..1fe8bb1
--- /dev/null
+++ b/src/Tokenizer.cpp
@@ -0,0 +1,96 @@
+#include <sourcetools.h>
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+namespace sourcetools {
+namespace {
+
+void asDataFrame(SEXP listSEXP, int n)
+{
+  r::Protect protect;
+  SEXP classSEXP = protect(Rf_mkString("data.frame"));
+  Rf_setAttrib(listSEXP, R_ClassSymbol, classSEXP);
+
+  SEXP rownamesSEXP = protect(Rf_allocVector(INTSXP, 2));
+  INTEGER(rownamesSEXP)[0] = NA_INTEGER;
+  INTEGER(rownamesSEXP)[1] = -n;
+  Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP);
+}
+
+SEXP asSEXP(const std::vector<tokens::Token>& tokens)
+{
+  r::Protect protect;
+  std::size_t n = tokens.size();
+  SEXP resultSEXP = protect(Rf_allocVector(VECSXP, 4));
+
+  // Set vector elements
+  SEXP valueSEXP = protect(Rf_allocVector(STRSXP, n));
+  SET_VECTOR_ELT(resultSEXP, 0, valueSEXP);
+  for (std::size_t i = 0; i < n; ++i) {
+    const std::string& contents = tokens[i].contents();
+    SEXP charSEXP = Rf_mkCharLen(contents.c_str(), contents.size());
+    SET_STRING_ELT(valueSEXP, i, charSEXP);
+  }
+
+  SEXP rowSEXP = protect(Rf_allocVector(INTSXP, n));
+  SET_VECTOR_ELT(resultSEXP, 1, rowSEXP);
+  for (std::size_t i = 0; i < n; ++i)
+    INTEGER(rowSEXP)[i] = tokens[i].row() + 1;
+
+  SEXP columnSEXP = protect(Rf_allocVector(INTSXP, n));
+  SET_VECTOR_ELT(resultSEXP, 2, columnSEXP);
+  for (std::size_t i = 0; i < n; ++i)
+    INTEGER(columnSEXP)[i] = tokens[i].column() + 1;
+
+  SEXP typeSEXP = protect(Rf_allocVector(STRSXP, n));
+  SET_VECTOR_ELT(resultSEXP, 3, typeSEXP);
+  for (std::size_t i = 0; i < n; ++i) {
+    const std::string& type = toString(tokens[i].type());
+    SEXP charSEXP = Rf_mkCharLen(type.c_str(), type.size());
+    SET_STRING_ELT(typeSEXP, i, charSEXP);
+  }
+
+  // Set names
+  SEXP namesSEXP = protect(Rf_allocVector(STRSXP, 4));
+
+  SET_STRING_ELT(namesSEXP, 0, Rf_mkChar("value"));
+  SET_STRING_ELT(namesSEXP, 1, Rf_mkChar("row"));
+  SET_STRING_ELT(namesSEXP, 2, Rf_mkChar("column"));
+  SET_STRING_ELT(namesSEXP, 3, Rf_mkChar("type"));
+
+  Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP);
+
+  asDataFrame(resultSEXP, n);
+
+  return resultSEXP;
+}
+
+} // anonymous namespace
+} // namespace sourcetools
+
+extern "C" SEXP sourcetools_tokenize_file(SEXP absolutePathSEXP)
+{
+  typedef sourcetools::tokens::Token Token;
+
+  const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0));
+  std::string contents;
+  if (!sourcetools::read(absolutePath, &contents))
+  {
+    Rf_warning("Failed to read file");
+    return R_NilValue;
+  }
+
+  const std::vector<Token>& tokens = sourcetools::tokenize(contents);
+  return sourcetools::asSEXP(tokens);
+}
+
+extern "C" SEXP sourcetools_tokenize_string(SEXP stringSEXP)
+{
+  typedef sourcetools::tokens::Token Token;
+  SEXP charSEXP = STRING_ELT(stringSEXP, 0);
+  const std::vector<Token>& tokens =
+    sourcetools::tokenize(CHAR(charSEXP), Rf_length(charSEXP));
+  return sourcetools::asSEXP(tokens);
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..c610b60
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+if (require("testthat", quietly = TRUE)) {
+  library(sourcetools)
+  test_check("sourcetools")
+}
diff --git a/tests/testthat/helper-utf8.R b/tests/testthat/helper-utf8.R
new file mode 100644
index 0000000..6da2209
--- /dev/null
+++ b/tests/testthat/helper-utf8.R
@@ -0,0 +1,3 @@
+octal <- "\012"
+hex   <- "\xE2\x99\xA5"
+utf8  <- "\u2665"
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
new file mode 100644
index 0000000..6c37e20
--- /dev/null
+++ b/tests/testthat/test-read.R
@@ -0,0 +1,30 @@
+context("Reader")
+
+files <- list.files()
+
+test_that("read_lines and readLines agree on output", {
+  for (file in files) {
+    expect_identical(
+      readLines(file),
+      sourcetools::read_lines(file)
+    )
+  }
+})
+
+test_that("read and readChar agree on output", {
+  for (file in files) {
+    expect_identical(
+      readChar(file, file.info(file)$size, TRUE),
+      sourcetools::read(file)
+    )
+  }
+})
+
+test_that("read_bytes and readBin agree on output", {
+  for (file in files) {
+    expect_identical(
+      readBin(file, "raw", file.info(file)$size),
+      sourcetools::read_bytes(file)
+    )
+  }
+})
diff --git a/tests/testthat/test-tokenize.R b/tests/testthat/test-tokenize.R
new file mode 100644
index 0000000..ba0eee2
--- /dev/null
+++ b/tests/testthat/test-tokenize.R
@@ -0,0 +1,165 @@
+context("Tokenizer")
+
+compare_tokens <- function(tokens, expected) {
+
+  if (is.character(tokens))
+    tokens <- tokenize_string(tokens)
+
+  expect_true(
+    nrow(tokens) == length(expected),
+    "different number of tokens"
+  )
+
+  for (i in 1:nrow(tokens)) {
+    expect_true(
+      tokens$value[[i]] == expected[[i]],
+      paste0("expected token '", tokens$value[[i]], "'; got '", expected[[i]], "'")
+    )
+  }
+
+}
+
+test_that("Operators are tokenized correctly", {
+
+  operators <- c(
+    "::", ":::", "$", "@", "[", "[[", "^", "-", "+", ":",
+    "*", "/", "+", "-", "<", ">", "<=", ">=", "==", "!=",
+    "!", "&", "&&", "|", "||", "~", "->", "->>", "<-", "<<-",
+    "=", "?", "**", "%%", "%for%"
+  )
+
+  tokenized <- tokenize_string(paste(operators, collapse = " "))
+
+  for (operator in operators) {
+    tokens <- tokenize_string(operator)
+    expect_true(nrow(tokens) == 1, paste("expected a single token ('", operator, "')", sep = ""))
+  }
+})
+
+test_that("Numbers are tokenized correctly", {
+
+  numbers <- c("1", "1.0", "0.1", ".1", "0.1E1", "1L", "1.0L", "1.5L",
+               "1E1", "1E-1", "1E-1L", ".100E-105L", "0.", "100.",
+               "1e+09", "1e+90", "1e-90", "1e-00000000000000009")
+
+  for (number in numbers) {
+    tokens <- tokenize_string(number)
+    expect_true(nrow(tokens) == 1, paste("expected a single token ('", number, "')", sep = ""))
+    token <- as.list(tokens[1, ])
+    expect_true(token$type == "number", paste("expected a number ('", token$type, "')", sep = ""))
+  }
+
+})
+
+test_that("The tokenizer accepts UTF-8 symbols", {
+  expect_true(nrow(tokenize_string("鬼")) == 1)
+})
+
+test_that("The tokenizer works correctly", {
+
+  # TODO: Should newlines be absorbed as part of the comment string?
+  tokens <- tokenize_string("# A Comment\n")
+  expected <- "# A Comment\n"
+  compare_tokens(tokens, expected)
+
+  tokens <- tokenize_string("a <- 1 + 2\n")
+  compare_tokens(
+    tokens,
+    c("a", " ", "<-", " ", "1", " ", "+", " ", "2", "\n")
+  )
+
+  compare_tokens(
+    tokenize_string("a<-1"),
+    c("a", "<-", "1")
+  )
+
+  # NOTE: '-' sign tokenized separately from number
+  compare_tokens(
+    tokenize_string("a< -1"),
+    c("a", "<", " ", "-", "1")
+  )
+
+  compare_tokens("1.0E5L", "1.0E5L")
+  compare_tokens(".1", ".1")
+  compare_tokens("'\\''", "'\\''")
+  compare_tokens(".a", ".a")
+  compare_tokens("...", "...")
+  compare_tokens(":=", ":=")
+  compare_tokens("x ** 2", c("x", " ", "**", " ", "2"))
+
+})
+
+test_that("`[[` and `[` are tokenized correctly", {
+
+  compare_tokens("x[[1]]", c("x", "[[", "1", "]]"))
+
+  # not really valid R code, but the tokenizer should still
+  # get it right
+  compare_tokens("[[[]]]", c("[[", "[", "]", "]]"))
+
+  compare_tokens(
+    "x[[a[b[[c[1]]]]]]",
+    c("x", "[[", "a", "[", "b", "[[", "c", "[", "1",
+      "]", "]]", "]", "]]")
+  )
+
+})
+
+test_that("Failures during number tokenization is detected", {
+  tokens <- tokenize_string("1.5E---")
+  expect_true(tokens$type[[1]] == "invalid")
+})
+
+test_that("invalid number e.g. 1E1.5 tokenized as single entity", {
+  tokens <- tokenize_string("1E1.5")
+  expect_true(nrow(tokens) == 1)
+  expect_true(tokens$type[[1]] == "invalid")
+})
+
+test_that("keywords are tokenized as keywords", {
+
+  keywords <- c("if", "else", "repeat", "while", "function",
+                "for", "in", "next", "break",
+                "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA",
+                "NA_integer_", "NA_real_", "NA_complex_", "NA_character_")
+
+  tokens <- lapply(keywords, function(keyword) {
+    tokenize_string(keyword)[1, ]
+  })
+
+  types <- unlist(lapply(tokens, `[[`, "type"))
+  expect_true(all(types == "keyword"))
+})
+
+test_that("comments without a trailing newline are tokenized", {
+  tokens <- tokenize_string("# abc")
+  expect_identical(tokens$type, "comment")
+})
+
+test_that("tokenization errors handled correctly", {
+  # previously, these reported an error where a NUL
+  # byte was accidentally included as part of the
+  # token value
+  tokenize_string("`abc")
+  tokenize_string("'abc")
+  tokenize_string("\"abc")
+  tokenize_string("%abc")
+})
+
+test_that("files in packages are tokenized without errors", {
+  skip_on_cran()
+
+  paths <- list.dirs("~/git", full.names = TRUE, recursive = FALSE)
+  packages <- paths[file.exists(file.path(paths, "DESCRIPTION"))]
+  R <- file.path(packages, "R")
+
+  for (dir in R) {
+    files <- list.files(dir, pattern = "R$", full.names = TRUE)
+    for (file in files) {
+      tokens <- tokenize_file(file)
+      errors <- tokens$type == "invalid"
+      expect_true(all(errors == FALSE))
+    }
+  }
+
+})

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-sourcetools.git


