[med-svn] [r-cran-lexrankr] 03/05: New upstream version 0.4.0

Andreas Tille tille at debian.org
Tue Oct 10 15:23:41 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository r-cran-lexrankr.

commit 27e74995630c1229cebd31db1e331d39df2b094f
Author: Andreas Tille <tille at debian.org>
Date:   Tue Oct 10 17:22:23 2017 +0200

    New upstream version 0.4.0
---
 DESCRIPTION                                        |  19 ++
 LICENSE                                            |   2 +
 MD5                                                |  43 ++++
 NAMESPACE                                          |  16 ++
 NEWS.md                                            |  18 ++
 R/RcppExports.R                                    |   7 +
 R/bind_lexrank.R                                   | 119 +++++++++
 R/lexRank.R                                        |  54 +++++
 R/lexRankFromSimil.R                               |  75 ++++++
 R/sentenceParse.R                                  |  37 +++
 R/sentenceSimil.R                                  | 101 ++++++++
 R/sentenceTokenParse.R                             |  35 +++
 R/sentence_parser.R                                |  10 +
 R/tokenize.R                                       |  69 ++++++
 R/unnest_sentences.R                               |  57 +++++
 README.md                                          | 213 ++++++++++++++++
 build/vignette.rds                                 | Bin 0 -> 240 bytes
 debian/changelog                                   |   5 -
 debian/compat                                      |   1 -
 debian/control                                     |  37 ---
 debian/copyright                                   |  33 ---
 debian/docs                                        |   2 -
 debian/rules                                       |   5 -
 debian/source/format                               |   1 -
 debian/tests/control                               |   9 -
 debian/tests/run-unit-test                         |  17 --
 debian/tests/vignette                              |   7 -
 debian/upstream/metadata                           |  10 -
 debian/watch                                       |   2 -
 inst/doc/Analyzing_Twitter_with_LexRankr.html      | 268 +++++++++++++++++++++
 inst/doc/Analyzing_Twitter_with_LexRankr.html.asis |   4 +
 man/bind_lexrank_.Rd                               |  73 ++++++
 man/lexRank.Rd                                     |  58 +++++
 man/lexRankFromSimil.Rd                            |  41 ++++
 man/sentenceParse.Rd                               |  25 ++
 man/sentenceSimil.Rd                               |  32 +++
 man/sentenceTokenParse.Rd                          |  36 +++
 man/sentence_parser.Rd                             |  18 ++
 man/tokenize.Rd                                    |  32 +++
 man/unnest_sentences_.Rd                           |  43 ++++
 src/RcppExports.cpp                                |  18 ++
 src/idfCosineSimil.cpp                             |  39 +++
 src/register_routines.c                            |  22 ++
 tests/testthat.R                                   |   4 +
 tests/testthat/test-bind_lexrank.R                 | 124 ++++++++++
 tests/testthat/test-bind_lexrank_.R                | 124 ++++++++++
 tests/testthat/test-idfCosine.R                    |  45 ++++
 tests/testthat/test-lexRank.R                      |  40 +++
 tests/testthat/test-lexRankFromSimil.R             |  78 ++++++
 tests/testthat/test-sentenceParse.R                |  47 ++++
 tests/testthat/test-sentenceSimil.R                |  70 ++++++
 tests/testthat/test-sentenceTokenParse.R           |  40 +++
 tests/testthat/test-tokenize.R                     | 144 +++++++++++
 tests/testthat/test-unnest_sentences.R             |  55 +++++
 tests/testthat/test-unnest_sentences_.R            |  56 +++++
 .../Analyzing_Twitter_with_LexRankr.html.asis      |   4 +
 56 files changed, 2415 insertions(+), 129 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..467c271
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,19 @@
+Package: lexRankr
+Type: Package
+Title: Extractive Summarization of Text with the LexRank Algorithm
+Version: 0.4.0
+Author: Adam Spannbauer <spannbaueradam at gmail.com>
+Maintainer: Adam Spannbauer <spannbaueradam at gmail.com>
+Description: An R implementation of the LexRank algorithm described by G. Erkan and D. R. Radev (2004) <DOI:10.1613/jair.1523>.
+License: MIT + file LICENSE
+URL: https://github.com/AdamSpannbauer/lexRankr/
+LazyData: TRUE
+RoxygenNote: 5.0.1
+Imports: dplyr, tidyr, magrittr, stringr, SnowballC, igraph, tm, Rcpp
+LinkingTo: Rcpp
+Suggests: covr, testthat, R.rsp
+VignetteBuilder: R.rsp
+NeedsCompilation: yes
+Packaged: 2017-03-02 14:27:17 UTC; u775749
+Repository: CRAN
+Date/Publication: 2017-03-02 16:21:22
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..5c6d95b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2016
+COPYRIGHT HOLDER: Adam Spannbauer
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..ec84816
--- /dev/null
+++ b/MD5
@@ -0,0 +1,43 @@
+b09fa890f724307b7a666da0ae18b966 *DESCRIPTION
+b458a8dd25dc372c4aeefb1b28542c7c *LICENSE
+3dfbe6abf64ff40c99c68b366de7ecac *NAMESPACE
+7ddd5f4b328baf74b02ff7fbf3360715 *NEWS.md
+8e9822c41154fdc3672feb037b3de49a *R/RcppExports.R
+39017020f0c1b46e32112b5a6ef3bf5a *R/bind_lexrank.R
+b5b5b38a51a0744b75921752089613ca *R/lexRank.R
+a32b76ade6f7d0c12ef34bb16fb357e9 *R/lexRankFromSimil.R
+9b512a4022f5cb870154ec8e59271f53 *R/sentenceParse.R
+308eaca464fb90969f0e02c4ec70070d *R/sentenceSimil.R
+6c276fb56c0ee80e3f74e59eb6e49c41 *R/sentenceTokenParse.R
+c6d46d77b1da31caeff8e47bc8ec7a1d *R/sentence_parser.R
+73ae0bc326c31900e2dde09b20af5aba *R/tokenize.R
+06a7631f18bac7fbf85675ad22d5abba *R/unnest_sentences.R
+af5d573f1dc77a5aab14b8cd6e8747aa *README.md
+e7c8e205a5d20da21a8a2656f9ef1d32 *build/vignette.rds
+85e967ef3b2094b02fd3fac0d0b1968a *inst/doc/Analyzing_Twitter_with_LexRankr.html
+66e47bc11bb84e17ca6e0e3c79e18d73 *inst/doc/Analyzing_Twitter_with_LexRankr.html.asis
+861cae03e1e1404451d636908c50fd4a *man/bind_lexrank_.Rd
+61567a82d7fb41bf03d0c46ac87da8d0 *man/lexRank.Rd
+3409c9b9026182386a6123fc90aa39e8 *man/lexRankFromSimil.Rd
+59d7a80fbb8dd83bdcd3b6e159db33f6 *man/sentenceParse.Rd
+55d9073db96b6c6ecca25de323b40989 *man/sentenceSimil.Rd
+9bbcd1ae27f2c5d8c1fcb6f0c8a72cfb *man/sentenceTokenParse.Rd
+60c25a4a9d91c9c605bb6a1daaa13e3e *man/sentence_parser.Rd
+2b28bf4eb65e4369816f5c42d86b9a73 *man/tokenize.Rd
+52c68d97fe2308c39781617d0c1264e5 *man/unnest_sentences_.Rd
+a4ba729353921f27dd1033666ba3d5a7 *src/RcppExports.cpp
+2cd12945f1ba427de73a8b690ffd4615 *src/idfCosineSimil.cpp
+b911f4ffbf1ed05047d75ba046bb4b54 *src/register_routines.c
+f2f13352ba2520410e0befed450da512 *tests/testthat.R
+fdb5c9a4e1b4fdc945f60e13e357d2fa *tests/testthat/test-bind_lexrank.R
+8cbdc84080d998fb116e1879865551cf *tests/testthat/test-bind_lexrank_.R
+af81d75390b209670947a0e5aa241719 *tests/testthat/test-idfCosine.R
+7e63f7f2a086f6063c363f4c566e25e4 *tests/testthat/test-lexRank.R
+a2a11099148d09929fb88c23d0291cbb *tests/testthat/test-lexRankFromSimil.R
+47c695bc1da291f010842fbcdbb1cb4a *tests/testthat/test-sentenceParse.R
+eae78dcfea05975cb1f37ab8659f91f1 *tests/testthat/test-sentenceSimil.R
+95d266d769c968dbc8680bf18925d02d *tests/testthat/test-sentenceTokenParse.R
+9aeddf2207014720bc16cf27c5c5b3a0 *tests/testthat/test-tokenize.R
+b8cb8bf2c1ab67737de55e66fe433cac *tests/testthat/test-unnest_sentences.R
+402d44fcccf8e09d18cbe4a5a6656810 *tests/testthat/test-unnest_sentences_.R
+66e47bc11bb84e17ca6e0e3c79e18d73 *vignettes/Analyzing_Twitter_with_LexRankr.html.asis
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..998b84a
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,16 @@
+# Generated by roxygen2: do not edit by hand
+
+export(bind_lexrank)
+export(bind_lexrank_)
+export(lexRank)
+export(lexRankFromSimil)
+export(sentenceParse)
+export(sentenceSimil)
+export(sentenceTokenParse)
+export(tokenize)
+export(unnest_sentences)
+export(unnest_sentences_)
+importFrom(Rcpp,sourceCpp)
+importFrom(magrittr,"%>%")
+importFrom(utils,combn)
+useDynLib(lexRankr)
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..30fac17
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,18 @@
+# lexRankr 0.4.0
+
+* added functions `unnest_sentences` and `unnest_sentences_` to parse sentences in a dataframe following tidy data principles
+* added functions `bind_lexrank` and `bind_lexrank_` to calculate lexrank scores for sentences in a dataframe following tidy data principles (`unnest_sentences` & `bind_lexrank` can be used on a df in a magrittr pipeline)
+* added vignette for using lexrank to analyze tweets
+
+# lexRankr 0.3.0
+
+* sentence similarity from `sentenceSimil` is now calculated using Rcpp.  Improves speed by ~25%-30% over the old implementation using the `proxy` package
+
+
+# lexRankr 0.2.0
+
+* Added logic to avoid naming conflicts in proxy::pr_DB in `sentenceSimil` (#1, @AdamSpannbauer)
+
+* Added check and error for cases where no sentences are above the threshold in `lexRankFromSimil` (#2, @AdamSpannbauer)
+
+* `tokenize` now has stricter punctuation removal.  Removes all non-alphanumeric characters as opposed to removing `[:punct:]`
diff --git a/R/RcppExports.R b/R/RcppExports.R
new file mode 100644
index 0000000..68b3e56
--- /dev/null
+++ b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+idfCosineSimil <- function(mat) {
+    .Call('lexRankr_idfCosineSimil', PACKAGE = 'lexRankr', mat)
+}
+
diff --git a/R/bind_lexrank.R b/R/bind_lexrank.R
new file mode 100644
index 0000000..6c0d890
--- /dev/null
+++ b/R/bind_lexrank.R
@@ -0,0 +1,119 @@
+#' Bind lexrank scores to a dataframe of text
+
+#' @description Bind lexrank scores to a dataframe of sentences or to a dataframe of tokens with sentence ids
+#' @param tbl dataframe containing column of sentences to be lexranked
+#' @param text name of column containing sentences or tokens to be lexranked
+#' @param doc_id name of column containing document ids corresponding to \code{text}
+#' @param sent_id Only needed if \code{level} is "tokens". name of column containing sentence ids corresponding to \code{text}
+#' @param level the parsed level of the text column to be lexranked.  i.e. is \code{text} a column of "sentences" or "tokens"?  The "tokens" level is provided to allow users to implement custom tokenization.  Note: even if the input \code{level} is "tokens" lexrank scores are assigned at the sentence level. 
+#' @param threshold The minimum similarity value a sentence pair must have to be represented in the graph where lexRank is calculated.
+#' @param usePageRank \code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.
+#' @param damping The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.
+#' @param continuous \code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.
+#' @param ... tokenizing options to be passed to lexRankr::tokenize.  Ignored if \code{level} is "sentences"
+#' @return A dataframe with an additional column of lexrank scores (the new column is named \code{lexrank})
+#' @examples
+#' library(dplyr)
+#' 
+#' df <- dplyr::tibble(doc_id = 1:3, 
+#'                     text = c("Testing the system. Second sentence for you.", 
+#'                              "System testing the tidy documents df.", 
+#'                              "Documents will be parsed and lexranked."))
+#' 
+#' df %>% 
+#'   unnest_sentences(sents, text) %>% 
+#'   bind_lexrank(sents, doc_id, level = 'sentences')
+#' 
+#' df %>% 
+#'   unnest_sentences(sents, text) %>% 
+#'   bind_lexrank_("sents", "doc_id", level = 'sentences')
+#' 
+#' df <- data.frame(doc_id  = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+#'                              2, 2, 2, 3, 3, 3, 3, 3, 3), 
+#'                  sent_id = c(1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 
+#'                              1, 1, 1, 1, 1, 1, 1, 1, 1), 
+#'                  tokens = c("testing", "the", "system", "second", 
+#'                             "sentence", "for", "you", "system", 
+#'                             "testing", "the", "tidy", "documents", 
+#'                             "df", "documents", "will", "be", "parsed", 
+#'                             "and", "lexranked"),
+#'                  stringsAsFactors = FALSE)
+#' 
+#' df %>% 
+#'   bind_lexrank(tokens, doc_id, sent_id, level = 'tokens')
+#' @export
+bind_lexrank_ <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences", "tokens"), threshold=.2, usePageRank=TRUE, damping=0.85, continuous=FALSE, ...) {
+  if(!is.data.frame(tbl)) stop("tbl must be a dataframe")
+  if(!(text %in% names(tbl))) stop("text column not found in tbl")
+  if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
+  if(!is.character(level)) stop("level must be character")
+  if(length(level) > 1) {
+    warning("only first element of level will be used")
+    level = level[1]
+  }
+  if(!(level %in% c("sentences", "tokens"))) stop("invalid value of level; accepted values for level are 'sentences' and 'tokens'")
+  if(level == "tokens") {
+    if(is.null(sent_id)) stop("sent_id must be provided when level is 'tokens'")
+    if(!(sent_id %in% names(tbl))) stop("sent_id column not found in tbl")
+    sent_ids <- tbl[[sent_id]]
+  } else {
+    sent_ids <- 1:nrow(tbl)
+  }
+  
+  tbl_class     <- class(tbl)
+  doc_id_class  <- class(tbl[[doc_id]])
+  
+  uuid_kinda <- paste0(c("a",sample(c(letters[1:6],0:9),30,replace=TRUE)), collapse = "")
+  uuid_sep   <- paste0("__", uuid_kinda,"__")
+  
+  doc_sent_ids <- paste0(tbl[[doc_id]], uuid_sep, sent_ids)
+  
+  if(level=="sentences") {
+    sent_id <- uuid_kinda
+    tokenDfList <- lapply(seq_along(tbl[[text]]), function(i) {
+      sentVec   <- tbl[[text]][i]
+      tokenList <- tokenize(text = sentVec, ...)
+      subTokenDfList <- lapply(seq_along(tokenList), function(j) {
+        data.frame(docId=tbl[[doc_id]][i], sentenceId=doc_sent_ids[i], token=tokenList[[j]], stringsAsFactors = FALSE)
+      })
+      dplyr::bind_rows(subTokenDfList)
+    })
+    
+    tokenDf <- dplyr::bind_rows(tokenDfList) %>%
+      dplyr::filter(!is.na(token))
+  } else {
+    tokenDf <- dplyr::tibble(docId=tbl[[doc_id]], sentenceId=doc_sent_ids, token=tbl[[text]])
+  }
+  
+  similDf <- sentenceSimil(tokenDf$sentenceId, tokenDf$token, tokenDf$docId)
+  topSentIdsDf <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, threshold=threshold, n=Inf, returnTies=TRUE, usePageRank=usePageRank, damping=damping, continuous=continuous)
+  lex_lookup <- stringr::str_split_fixed(topSentIdsDf$sentenceId, uuid_sep, n=2) %>% 
+    dplyr::as_data_frame() %>% 
+    stats::setNames(c(doc_id, sent_id))
+  class(lex_lookup[[doc_id]])  <- doc_id_class
+  
+  lex_lookup$lexrank <- topSentIdsDf$value
+  
+  if(level=="tokens") {
+    class(lex_lookup[[sent_id]]) <- class(tbl[[sent_id]])
+    tbl_out <- dplyr::left_join(tbl, lex_lookup, by=c(doc_id, sent_id))
+  } else {
+    tbl[[uuid_kinda]] <- as.character(sent_ids)
+    tbl_out <- dplyr::left_join(tbl, lex_lookup, by=c(doc_id, uuid_kinda))
+    tbl_out[[uuid_kinda]] <- NULL
+  }
+  
+  class(tbl_out) <- tbl_class
+  tbl_out
+}
+
+#' @rdname bind_lexrank_
+#' @export
+bind_lexrank <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences", "tokens"), threshold=.2, usePageRank=TRUE, damping=0.85, continuous=FALSE, ...) {
+  text_str    <- as.character(substitute(text))
+  doc_id_str  <- as.character(substitute(doc_id))
+  sent_id_str <- substitute(sent_id)
+  if (!is.null(sent_id_str)) sent_id_str <- as.character(sent_id_str)
+  
+  bind_lexrank_(tbl, text_str, doc_id_str, sent_id=sent_id_str, level=level, threshold=threshold, usePageRank=usePageRank, damping=damping, continuous=continuous, ...)
+}
diff --git a/R/lexRank.R b/R/lexRank.R
new file mode 100644
index 0000000..42dd80b
--- /dev/null
+++ b/R/lexRank.R
@@ -0,0 +1,54 @@
+utils::globalVariables(c("value"))
+#' Extractive text summarization with LexRank
+
+#' @description Compute LexRanks from a vector of documents using the page rank algorithm or degree centrality. The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
+#' @param text A character vector of documents to be cleaned and processed by the LexRank algorithm
+#' @param docId A vector of document IDs with length equal to the length of \code{text}.  If \code{docId == "create"} then doc IDs will be created as an index from 1 to \code{n}, where \code{n} is the length of \code{text}.
+#' @param threshold The minimum simil value a sentence pair must have to be represented in the graph where lexRank is calculated.
+#' @param n The number of sentences to return as the extractive summary.  The function will return the top \code{n} lexRanked sentences.  See \code{returnTies} for handling ties in lexRank.
+#' @param returnTies \code{TRUE} or \code{FALSE} indicating whether or not to return greater than \code{n} sentence IDs if there is a tie in lexRank.  If \code{TRUE}, the returned number of sentences will not be limited to \code{n}, but rather will return every sentence with a top \code{n} score.  If \code{FALSE}, the returned number of sentences will be \code{<=n}. Defaults to \code{TRUE}.
+#' @param usePageRank \code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.
+#' @param damping The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.
+#' @param continuous \code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.
+#' @param sentencesAsDocs \code{TRUE} or \code{FALSE}, indicating whether or not to treat sentences as documents when calculating tfidf scores for similarity. If \code{TRUE}, inverse document frequency will be calculated as inverse sentence frequency (useful for single document extractive summarization).
+#' @param removePunc \code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from text while tokenizing.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.
+#' @param removeNum \code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from text while tokenizing.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.
+#' @param toLower \code{TRUE} or \code{FALSE} indicating whether or not to coerce all of text to lowercase while tokenizing.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.
+#' @param stemWords \code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.
+#' @param rmStopWords \code{TRUE}, \code{FALSE}, or character vector of stopwords to remove from tokens. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.
+#' @param Verbose \code{TRUE} or \code{FALSE} indicating whether or not to \code{cat} progress messages to the console while running.  Defaults to \code{TRUE}.
+
+#' @return A dataframe with columns \code{docId}, \code{sentenceId}, \code{sentence}, and \code{value}, containing the top \code{n} sentences in descending order by \code{value}. \code{value} contains the page rank score (if \code{usePageRank==TRUE}) or degree centrality (if \code{usePageRank==FALSE}).
+#' @references \url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+#' @examples
+#' lexRank(c("This is a test.","Tests are fun.",
+#' "Do you think the exam will be hard?","Is an exam the same as a test?",
+#' "How many questions are going to be on the exam?"))
+#' @importFrom magrittr "%>%"
+
+#' @export
+lexRank <- function(text, docId = "create", threshold=.2, n=3, returnTies=TRUE, usePageRank=TRUE, damping=0.85, continuous=FALSE, sentencesAsDocs=FALSE, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWords=TRUE, rmStopWords=TRUE, Verbose=TRUE){
+
+  if(!is.logical(Verbose)) stop("Verbose must be logical")
+  if(length(Verbose) != 1) stop("Verbose must be length 1")
+
+  if(Verbose) cat("Parsing text into sentences and tokens...")
+  sentTokList <- sentenceTokenParse(text=text, docId = docId, removePunc=removePunc, removeNum=removeNum, toLower=toLower, stemWords=stemWords, rmStopWords=rmStopWords)
+  if(Verbose) cat("DONE\n")
+  sentDf <- sentTokList$sentences
+  tokenDf <- sentTokList$tokens
+
+  if(Verbose) cat("Calculating pairwise sentence similarities...")
+  similDf <- sentenceSimil(sentenceId=tokenDf$sentenceId, token=tokenDf$token, docId=tokenDf$docId, sentencesAsDocs=sentencesAsDocs)
+  if(Verbose) cat("DONE\n")
+
+  if(Verbose) cat("Applying LexRank...")
+  topNSents <- lexRankFromSimil(s1=similDf$sent1, s2=similDf$sent2, simil=similDf$similVal, threshold=threshold, n=n, returnTies=returnTies, usePageRank=usePageRank, damping=damping, continuous=continuous)
+  if(Verbose) cat("DONE\nFormatting Output...")
+  returnDf <- sentDf %>%
+    dplyr::inner_join(topNSents, by=c("sentenceId"="sentenceId")) %>%
+    dplyr::arrange(dplyr::desc(value))
+  if(Verbose) cat("DONE\n")
+
+  return(returnDf)
+}
diff --git a/R/lexRankFromSimil.R b/R/lexRankFromSimil.R
new file mode 100644
index 0000000..0ee9bfe
--- /dev/null
+++ b/R/lexRankFromSimil.R
@@ -0,0 +1,75 @@
+utils::globalVariables(c("weight","sentenceId","degree","degRank"))
+#' Compute LexRanks from pairwise sentence similarities
+
+#' @description Compute LexRanks from sentence pair similarities using the page rank algorithm or degree centrality. The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
+#' @param s1 A character vector of sentence IDs corresponding to the \code{s2} and \code{simil} arguments.
+#' @param s2 A character vector of sentence IDs corresponding to the \code{s1} and \code{simil} arguments.
+#' @param simil A numeric vector of similarity values that represents the similarity between the sentences represented by the IDs in \code{s1} and \code{s2}.
+#' @param threshold The minimum simil value a sentence pair must have to be represented in the graph where lexRank is calculated.
+#' @param n The number of sentences to return as the extractive summary.  The function will return the top \code{n} lexRanked sentences.  See \code{returnTies} for handling ties in lexRank.
+#' @param returnTies \code{TRUE} or \code{FALSE} indicating whether or not to return greater than \code{n} sentence IDs if there is a tie in lexRank.  If \code{TRUE}, the returned number of sentences will not be limited to \code{n}, but rather will return every sentence with a top \code{n} score.  If \code{FALSE}, the returned number of sentences will be \code{<=n}. Defaults to \code{TRUE}.
+#' @param usePageRank \code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.
+#' @param damping The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.
+#' @param continuous \code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.
+#' @return A 2 column dataframe with columns \code{sentenceId} and \code{value}. \code{sentenceId} contains the ids of the top \code{n} sentences in descending order by \code{value}. \code{value} contains page rank score (if \code{usePageRank==TRUE}) or degree centrality (if \code{usePageRank==FALSE}).
+#' @references \url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+#' @examples
+#' lexRankFromSimil(s1=c("d1_1","d1_1","d1_2"), s2=c("d1_2","d2_1","d2_1"), simil=c(.01,.03,.5))
+#' @importFrom magrittr "%>%"
+
+#' @export
+
+lexRankFromSimil <- function(s1, s2, simil, threshold=.2, n=3, returnTies=TRUE, usePageRank=TRUE, damping=0.85, continuous=FALSE) {
+  if(!is.logical(returnTies)) stop("returnTies must be logical")
+  if(length(returnTies) != 1) stop("returnTies must be length 1")
+  if(!is.logical(usePageRank)) stop("usePageRank must be logical")
+  if(length(usePageRank) != 1) stop("usePageRank must be length 1")
+  if(!is.logical(continuous)) stop("continuous must be logical")
+  if(length(continuous) != 1) stop("continuous must be length 1")
+  if(!is.numeric(simil)) stop("simil must be numeric")
+  if(!is.numeric(n)) stop("n must be numeric")
+  if(length(n) != 1) stop("n must be length 1")
+
+  if (length(s1) != length(s2) | length(s1) != length(simil)) stop("s1, s2, & simil must all be the same length")
+  if (sum(simil) == 0) stop("all simil values are zero")
+  if (sum(simil > threshold) == 0) stop("all simil values are below threshold")
+
+  s1 <- as.character(s1)
+  s2 <- as.character(s2)
+
+  if(returnTies) tieMethod <- "min" else if(!returnTies) tieMethod <- "first"
+
+  edges <- dplyr::data_frame(s1=s1, s2=s2, weight=simil)
+
+  if(!continuous | !usePageRank) {
+    if(!is.numeric(threshold)) stop("threshold must be numeric")
+    if(length(threshold) != 1) stop("threshold must be length 1")
+
+    edges <- edges %>%
+      dplyr::filter(weight > threshold) %>%
+      dplyr::select(-weight)
+  }
+
+  if (usePageRank) {
+    if(!is.numeric(damping)) stop("damping must be numeric")
+    if(length(damping) != 1) stop("damping must be length 1")
+
+    sentGraph <- igraph::graph_from_data_frame(edges, directed = FALSE)
+    sentRank <- igraph::page_rank(sentGraph, directed=FALSE)$vector
+    sentRanksRanked <- rank(1/sentRank, ties.method = tieMethod)
+    topCentral <- sentRank[which(sentRanksRanked <= n)]
+    centralDf <- data.frame(sentenceId=names(topCentral), value=topCentral,stringsAsFactors = FALSE)
+    rownames(centralDf) <- NULL
+  } else if(!usePageRank){
+    centralDf <- dplyr::data_frame(sentenceId = c(edges$s1, edges$s2)) %>%
+      dplyr::group_by(sentenceId) %>%
+      dplyr::summarise(degree=n()) %>%
+      dplyr::ungroup() %>%
+      dplyr::arrange(dplyr::desc(degree)) %>%
+      dplyr::mutate(degRank = rank(1/degree, ties.method = tieMethod)) %>%
+      dplyr::filter(degRank <= n) %>%
+      dplyr::select(sentenceId, value=degree)
+    class(centralDf) <- "data.frame"
+  }
+  return(centralDf)
+}
diff --git a/R/sentenceParse.R b/R/sentenceParse.R
new file mode 100644
index 0000000..ab4edea
--- /dev/null
+++ b/R/sentenceParse.R
@@ -0,0 +1,37 @@
+#' Parse text into sentences
+
+#' @description Parse the elements of a character vector into a dataframe of sentences with additional identifiers.
+#' @param text Character vector to be parsed into sentences
+#' @param docId A vector of document IDs with length equal to the length of \code{text}.  If \code{docId == "create"} then doc IDs will be created as an index from 1 to \code{n}, where \code{n} is the length of \code{text}.
+#' @return A data frame with 3 columns and \code{n} rows, where \code{n} is the number of sentences found by the routine.  Column 1: \code{docId} document id for the sentence. Column 2: \code{sentenceId} sentence id for the sentence.  Column 3: \code{sentence} the sentences found in the routine.
+#' @examples
+#' sentenceParse("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA.")
+#' sentenceParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."),
+#'                docId=c("d1","d2"))
+#' @importFrom magrittr "%>%"
+
+#' @export
+sentenceParse <- function(text, docId = "create") {
+  if(!is.character(text)) stop("text must be character")
+  if(length(text) < 1) stop("text must be at least length 1")
+  docId <- as.character(docId)
+  if(length(docId)==1 & docId[1]=="create") {
+      createDocIds <- TRUE
+    } else if(length(docId)==length(text)) {
+      createDocIds <- FALSE
+    } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
+
+
+  sentences <- sentence_parser(text)
+  sentenceDfList <- lapply(seq_along(sentences), function(i) {
+    sentVec <- trimws(sentences[[i]])
+    if(createDocIds) {
+      data.frame(docId=i, sentenceId=paste0(i,"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
+    } else if(!createDocIds) {
+      data.frame(docId=docId[i], sentenceId=paste0(docId[i],"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
+    }
+  })
+  sentenceDf <- dplyr::bind_rows(sentenceDfList)
+  class(sentenceDf) <- "data.frame"
+  return(sentenceDf)
+}
diff --git a/R/sentenceSimil.R b/R/sentenceSimil.R
new file mode 100644
index 0000000..ce7311e
--- /dev/null
+++ b/R/sentenceSimil.R
@@ -0,0 +1,101 @@
+#' @useDynLib lexRankr
+#' @importFrom Rcpp sourceCpp
+NULL
+
+utils::globalVariables(c("n","tf","idf","tfidf","V1","V2","similVal"))
+#' Compute distance between sentences
+
+#' @description Compute distance between sentences using modified idf cosine distance from "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization".  Output can be used as input to \code{\link{lexRankFromSimil}}.
+#' @param sentenceId A character vector of sentence IDs corresponding to the \code{docId} and \code{token} arguments.
+#' @param token A character vector of tokens corresponding to the \code{docId} and \code{sentenceId} arguments.
+#' @param docId A character vector of document IDs corresponding to the \code{sentenceId} and \code{token} arguments.  Can be \code{NULL} if \code{sentencesAsDocs} is \code{TRUE}.
+#' @param sentencesAsDocs \code{TRUE} or \code{FALSE}, indicating whether or not to treat sentences as documents when calculating tfidf scores. If \code{TRUE}, inverse document frequency will be calculated as inverse sentence frequency (useful for single document extractive summarization)
+#' @return A 3 column dataframe of pairwise similarities between sentences. Columns: \code{sent1} (sentence id), \code{sent2} (sentence id), & \code{similVal} (similarity between \code{sent1} and \code{sent2}).
+#' @references \url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+#' @examples
+#' sentenceSimil(docId=c("d1","d1","d2","d2"),
+#'                sentenceId=c("d1_1","d1_1","d2_1","d2_1"),
+#'                token=c("i", "ran", "jane", "ran"))
+#' @importFrom utils combn
+#' @importFrom magrittr "%>%"
+
+#' @export
+sentenceSimil <- function(sentenceId, token, docId=NULL, sentencesAsDocs=FALSE){
+  if(!is.logical(sentencesAsDocs)) stop("sentencesAsDocs must be logical")
+  if(length(sentencesAsDocs) != 1) stop("sentencesAsDocs must be length 1")
+
+  if(!sentencesAsDocs & is.null(docId)) stop("docIds must be provided if sentencesAsDocs is FALSE")
+  sentenceId <- as.character(sentenceId)
+  if(!is.character(token)) stop("token must be character")
+  if(length(token) < 1) stop("token must be at least length 1")
+
+  if(sentencesAsDocs) {
+    docId <- sentenceId
+    if(length(docId) != length(sentenceId) | length(docId) != length(token)) stop("docId, sentenceId, & token must all be the same length")
+  } else if (!sentencesAsDocs) {
+    docId <- as.character(docId)
+    if(length(sentenceId) != length(token)) stop("sentenceId & token must be the same length")
+  }
+
+  ndoc <- length(unique(docId))
+  if(ndoc > length(unique(sentenceId))) warning("There are more unique docIds than sentenceIds.  Verify you have passed the correct parameters to the function.")
+
+  tokenDf <- dplyr::data_frame(docId=docId, sentenceId=sentenceId, token=token)
+  stm <- tokenDf %>%
+    dplyr::group_by(docId,token) %>%
+    dplyr::summarise(tf = n()) %>%
+    dplyr::ungroup() %>%
+    dplyr::group_by(token) %>%
+    dplyr::mutate(idf = log(ndoc/n())) %>%
+    dplyr::mutate(tfidf = tf*idf) %>%
+    dplyr::ungroup()
+
+  if(!sentencesAsDocs) {
+    stm <- stm %>%
+      dplyr::right_join(tokenDf, by=c("docId"="docId", "token"="token")) %>%
+      dplyr::select(sentenceId, token, tfidf) %>%
+      dplyr::filter(tfidf > 0) %>%
+      unique()
+  } else if (sentencesAsDocs) {
+    stm <- stm %>%
+      dplyr::select(sentenceId=docId, token, tfidf) %>%
+      dplyr::filter(tfidf > 0) %>%
+      unique()
+  }
+
+  if(nrow(stm)==0) stop("All values in sentence term tfidf matrix are 0.  Similarities would return as NaN")
+  if(length(unique((stm$sentenceId))) == 1) stop("Only one sentence had nonzero tfidf scores.  Similarities would return as NaN")
+
+  stm <- tidyr::spread(stm, key=token, value=tfidf, fill=0, drop=FALSE)
+
+  matRowNames <- stm$sentenceId
+
+  stm <- stm %>%
+    dplyr::select(-sentenceId) %>%
+    as.matrix()
+  rownames(stm) <- matRowNames
+
+  #old non C slow version for idfcosine similarity
+  # idfCosine <- function(x,y) {
+  #   sum(x*y)/(sqrt(sum(x^2))*sqrt(sum(y^2)))
+  # }
+  # prDBname <- "idfCosine"
+  # while (proxy::pr_DB$entry_exists(prDBname)) {
+  #   prDBname <- paste0(prDBname, sample(100:999,1))
+  #   cat(prDBname,"\n")
+  # }
+  # proxy::pr_DB$set_entry(FUN=idfCosine, names=prDBname)
+  # similMat <- proxy::dist(stm, method=prDBname)
+  # proxy::pr_DB$delete_entry(prDBname)
+
+  sentencePairsDf <- sort(rownames(stm)) %>%
+    combn(2) %>%
+    t() %>%
+    as.data.frame(stringsAsFactors=FALSE) %>%
+    dplyr::mutate(similVal = idfCosineSimil(stm)) %>%
+    # dplyr::mutate(similVal = as.numeric(similMat)) %>%
+    dplyr::select(sent1=V1, sent2=V2, similVal)
+  class(sentencePairsDf) <- "data.frame"
+
+  return(sentencePairsDf)
+}
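For reference, the compiled `idfCosineSimil` routine replaces the commented-out R implementation shown above and computes the same idf-modified cosine from the LexRank paper. A minimal standalone sketch of that formula on two hypothetical tfidf vectors (illustrative values, not part of the package):

    # idf-modified cosine of two tfidf vectors over a shared vocabulary
    x <- c(0.5, 0.0, 1.2)   # tfidf weights of sentence 1 (hypothetical)
    y <- c(0.5, 0.9, 0.0)   # tfidf weights of sentence 2 (hypothetical)
    sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))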
diff --git a/R/sentenceTokenParse.R b/R/sentenceTokenParse.R
new file mode 100644
index 0000000..d68ffd7
--- /dev/null
+++ b/R/sentenceTokenParse.R
@@ -0,0 +1,35 @@
+utils::globalVariables(c("token"))
+#' Parse text into sentences and tokens
+
+#' @description Parse a character vector of documents into both sentences and a clean vector of tokens.  The resulting output includes IDs for document and sentence for use in other \code{lexRank} functions.
+#' @param text A character vector of documents to be parsed into sentences and tokenized.
+#' @param docId A character vector of document Ids the same length as \code{text}.  If \code{docId=="create"} document Ids will be created.
+#' @param removePunc \code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from \code{text} while tokenizing.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.
+#' @param removeNum \code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from \code{text} while tokenizing.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.
+#' @param toLower \code{TRUE} or \code{FALSE} indicating whether or not to coerce all of \code{text} to lowercase while tokenizing.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.
+#' @param stemWords \code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.
+#' @param rmStopWords \code{TRUE}, \code{FALSE}, or character vector of stopwords to remove from tokens. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.
+#' @return A list of dataframes.  The first element of the list returned is the \code{sentences} dataframe; this dataframe has columns \code{docId}, \code{sentenceId}, & \code{sentence} (the actual text of the sentence).  The second element of the list returned is the \code{tokens} dataframe; this dataframe has columns \code{docId}, \code{sentenceId}, & \code{token} (the actual text of the token).
+#' @examples
+#' sentenceTokenParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."),
+#'                    docId=c("d1","d2"))
+#' @importFrom magrittr "%>%"
+
+#' @export
+sentenceTokenParse <- function(text, docId = "create", removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWords=TRUE, rmStopWords=TRUE){
+  sentenceDf <- sentenceParse(text, docId=docId)
+
+  tokenDfList <- lapply(seq_along(sentenceDf$sentence), function(i) {
+    sentVec <- sentenceDf$sentence[i]
+    tokenList <- tokenize(text = sentVec, removePunc = removePunc, removeNum = removeNum, toLower = toLower, stemWords = stemWords, rmStopWords=rmStopWords)
+    subTokenDfList <- lapply(seq_along(tokenList), function(j) {
+      data.frame(docId=sentenceDf$docId[i], sentenceId=sentenceDf$sentenceId[i], token=tokenList[[j]], stringsAsFactors = FALSE)
+    })
+    dplyr::bind_rows(subTokenDfList)
+  })
+  tokenDf <- dplyr::bind_rows(tokenDfList) %>%
+    dplyr::filter(!is.na(token))
+  class(tokenDf) <- "data.frame"
+
+  list(sentences=sentenceDf, tokens=tokenDf)
+}
diff --git a/R/sentence_parser.R b/R/sentence_parser.R
new file mode 100644
index 0000000..e1f617e
--- /dev/null
+++ b/R/sentence_parser.R
@@ -0,0 +1,10 @@
+#' Utility to parse sentences from text
+
+#' @description Utility to parse sentences from text; created to have a central shared sentence parsing function
+#' @param text Character vector to be parsed into sentences
+#' @return A list with length equal to `length(text)`; list elements are character vectors of text parsed with sentence regex
+
+sentence_parser <- function(text) {
+  stringr::str_split(string = text, 
+                     pattern = stringr::regex("(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s"))
+}
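Since `sentence_parser` is internal (it is not exported in NAMESPACE), a minimal sketch of its behavior, assuming the installed package and using `:::` to reach the unexported helper:

    lexRankr:::sentence_parser("Testing the system. Second sentence for you.")
    # expected: a list of length 1 (one element per input string) containing
    # c("Testing the system.", "Second sentence for you.")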
diff --git a/R/tokenize.R b/R/tokenize.R
new file mode 100644
index 0000000..cf5e3fe
--- /dev/null
+++ b/R/tokenize.R
@@ -0,0 +1,69 @@
+#' Tokenize a character vector
+
+#' Parse the elements of a character vector into a list of cleaned tokens.
+#' @param text The character vector to be tokenized
+#' @param removePunc \code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from \code{text}.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.
+#' @param removeNum \code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from \code{text}.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.
+#' @param toLower \code{TRUE} or \code{FALSE} indicating whether or not to coerce all of \code{text} to lowercase.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.
+#' @param stemWords \code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.
+#' @param rmStopWords \code{TRUE}, \code{FALSE}, or character vector of stopwords to remove. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.
+#' @examples
+#' tokenize("Mr. Feeny said the test would be on Sat. At least I'm 99.9% sure that's what he said.")
+#' tokenize("Bill is trying to earn a Ph.D. in his field.", rmStopWords=FALSE)
+#' @importFrom magrittr "%>%"
+
+#' @export
+tokenize <- function(text, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWords=TRUE, rmStopWords=TRUE){
+  if(!is.character(text)) stop("text must be character")
+  if(length(text) < 1) stop("text must be at least length 1")
+  if(!is.logical(removePunc)) stop("removePunc must be logical")
+  if(length(removePunc) != 1) stop("removePunc must be length 1")
+  if(!is.logical(removeNum)) stop("removeNum must be logical")
+  if(length(removeNum) != 1) stop("removeNum must be length 1")
+  if(!is.logical(toLower)) stop("toLower must be logical")
+  if(length(toLower) != 1) stop("toLower must be length 1")
+  if(!is.logical(stemWords)) stop("stemWords must be logical")
+  if(length(stemWords) != 1) stop("stemWords must be length 1")
+  if(!is.logical(rmStopWords) & !is.character(rmStopWords)) stop("rmStopWords must be a logical or a character vector")
+  if(is.character(rmStopWords)) {
+    rmStopWordFlag <- TRUE
+    stopwords <- rmStopWords
+  } else if(is.logical(rmStopWords)) {
+    if(length(rmStopWords) != 1) stop("rmStopWords must be length 1 if passed as a logical")
+    if(rmStopWords) {
+      rmStopWordFlag <- TRUE
+      stopwords <- tm::stopwords("SMART")
+    } else {
+      rmStopWordFlag <- FALSE
+    }
+  }
+
+  if (removePunc) text <- gsub(x=text,pattern="[^[:alnum:] ]",replacement="")
+  if (removeNum) text <- gsub(x=text,pattern="([[:digit:]])",replacement="")
+  if (toLower) text <- tolower(text)
+
+  text <- text %>%
+    gsub(pattern="([^[:alnum:] ])",replacement=" \\1 ") %>% 
+    gsub(pattern="\\s+",replacement=" ") %>%
+    trimws() %>% 
+    stringr::str_split(pattern=" ")
+
+  if(rmStopWordFlag) text <- lapply(text, function(tokens) {
+    checkTokens <- tolower(tokens)
+    if (!removePunc) {
+      checkTokens <- gsub(x=checkTokens,pattern="[^[:alnum:] ]",replacement="")
+    }
+    
+    nonStopTok <- tokens[which(!checkTokens %in% stopwords)]
+    if(length(nonStopTok) == 0) NA_character_ else nonStopTok
+  })
+  if(stemWords) {
+    text <- lapply(text, SnowballC::wordStem)
+  }
+
+  tokenList <- lapply(text, function(tokens) {
+    goodTok <- tokens[which(trimws(tokens) != "")]
+    if(length(goodTok) == 0) NA_character_ else goodTok
+  })
+  tokenList
+}
diff --git a/R/unnest_sentences.R b/R/unnest_sentences.R
new file mode 100644
index 0000000..79fcaa3
--- /dev/null
+++ b/R/unnest_sentences.R
@@ -0,0 +1,57 @@
+#' Split a column of text into sentences
+
+#' @description Split a column of text into sentences
+#' @param tbl dataframe containing column of text to be split into sentences
+#' @param output name of column to be created to store parsed sentences
+#' @param input name of input column of text to be parsed into sentences
+#' @param output_id name of column to be created to store sentence ids
+#' @param drop whether original input column should get dropped
+#' @return A data.frame of parsed sentences and sentence ids
+#' @examples
+#' library(dplyr)
+#' 
+#' df <- dplyr::tibble(doc_id = 1:3, 
+#'                     text = c("Testing the system. Second sentence for you.", 
+#'                              "System testing the tidy documents df.", 
+#'                              "Documents will be parsed and lexranked."))
+#'
+#' unnest_sentences(df, sents, text)
+#' unnest_sentences_(df, "sents", "text")
+#' 
+#' df %>% 
+#'   unnest_sentences(sents, text)
+
+#' @export
+unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE) {
+  if(!is.data.frame(tbl)) stop("tbl must be a dataframe")
+  if(!(input %in% names(tbl))) stop("input column not found in tbl")
+  if(!is.character(tbl[[input]])) stop("input column must be character")
+  if(length(output_id) > 1) {
+    warning("only first element of output_id will be used")
+    output_id <- output_id[1]
+  }
+  if(!is.logical(drop)) stop("drop must be logical")
+  
+  text <- tbl[[input]]
+  parsed_sents <- sentence_parser(text)
+  sent_ids     <- lapply(parsed_sents, function(.x) 1:length(.x))
+  
+  if (drop) {
+    tbl[[input]] <- NULL
+  }
+  
+  tbl[[output_id]] <- sent_ids
+  tbl[[output]]    <- parsed_sents
+  
+  tidyr::unnest(tbl)
+}
+
+#' @rdname unnest_sentences_
+#' @export
+unnest_sentences <- function(tbl, output, input, output_id='sent_id', drop=TRUE) {
+  output_str <- as.character(substitute(output))
+  input_str  <- as.character(substitute(input))
+  out_id_str <- as.character(substitute(output_id))
+  
+  unnest_sentences_(tbl, output_str, input_str, out_id_str, drop)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7538b50
--- /dev/null
+++ b/README.md
@@ -0,0 +1,213 @@
+# lexRankr: Extractive Text Summarization in R
+
+[![Build Status](https://travis-ci.org/AdamSpannbauer/lexRankr.svg?branch=master)](https://travis-ci.org/AdamSpannbauer/lexRankr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/AdamSpannbauer/lexRankr?branch=master&svg=true)](https://ci.appveyor.com/project/AdamSpannbauer/lexRankr) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/lexRankr)](https://CRAN.R-project.org/package=lexRankr) [![Coverage Status](https://img.shields.io/codecov/c/github/Ada [...]
+
+## Installation
+    
+    ##install from CRAN
+    install.packages("lexRankr")
+    
+    #install from this github repo
+    devtools::install_github("AdamSpannbauer/lexRankr")
+    
+
+
+## Overview
+lexRankr is an R implementation of the LexRank algorithm discussed by Güneş Erkan & Dragomir R. Radev in [LexRank: Graph-based Lexical Centrality as Salience in Text Summarization](http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html).  LexRank is designed to summarize a cluster of documents by proposing which sentences subsume the most information in that particular set of documents.  The algorithm may not perform well on a set of unclustered/unrelated set  [...]
+
+*note: the lexrank algorithm is designed to work on a cluster of documents. LexRank is built on the idea that a cluster of docs will focus on similar topics*
+
+*note: pairwise sentence similarity is calculated for the entire set of documents passed to the function.  This can be a computationally intensive process (especially with a large set of documents)*
+
+## Basic Usage
+#### lexRank in a tidy framework
+  ```
+  library(lexRankr)
+  library(dplyr)
+
+  df <- tibble(doc_id = 1:3, 
+               text = c("Testing the system. Second sentence for you.", 
+                        "System testing the tidy documents df.", 
+                        "Documents will be parsed and lexranked."))
+                        
+  df %>% 
+      unnest_sentences(sents, text) %>% 
+      bind_lexrank(sents, doc_id, level = 'sentences') %>% 
+      arrange(desc(lexrank))
+  ```
+#### lexRank applied to a character vector of documents
+  ```
+    library(lexRankr)
+
+    text <- c("Testing the system. Second sentence for you.", 
+              "System testing the tidy documents df.", 
+              "Documents will be parsed and lexranked.")
+                        
+    lexRank(text)
+  ```
+  
+## Example with Twitter (using lexRank helper functions)
+    ```
+    library(jsonlite)
+    library(httr)
+    
+    library(lexRankr)
+    
+    #########################################################
+    #FUNCTION TO GET TEXT OF USERS LAST N TWEETS ON TIMELINE#
+    #########################################################
+    getUserTweets <- function(user, n, consKey, consSecret, token, tokenSecret) {
+    
+      #SET UP FOR API
+      auth <- oauth_app("twitter", key=consKey, secret=consSecret)
+      sig  <- sign_oauth1.0(auth, token=token, token_secret=tokenSecret)
+    
+      #INITIALIZE COUNTERS AND STORAGE
+      nLeft  <- n
+      i <- 0
+      tweetText <- character(0)
+      #LOOP UNTIL n IS MET
+      while (nLeft > 0) {
+        nToGet <- min(200, nLeft)
+        i <- i+1
+    
+        #SET MAX ID IF i > 1 (MAX ID WILL KEEP TWEETS FROM BEING DUPLICATED IN GET)
+        if (i==1) {
+          GETurl    <- paste0("https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=", user,"&count=", nToGet)
+        } else {
+          GETurl    <- paste0("https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=", user,"&count=",nToGet,"&max_id=",    max_id)
+        }
+    
+        #GET TWEETS
+        timelineRaw <- GET(GETurl, sig)
+    
+        #EXTRACT TWEET TEXT FROM GET
+        timelineContent <- content(timelineRaw)
+        jsonRaw <- toJSON(timelineContent)
+        jsonConv <- iconv(jsonRaw, "UTF-8", "ASCII", sub = "") #clean out emoji and other special chars
+        jsonConv <- gsub("\003", "", jsonConv) #special character (^C) not caught by above clean
+        jsonClean <- fromJSON(jsonConv)
+    
+        #APPEND TWEET TEXT
+        tweetText <- c(tweetText, unlist(jsonClean$text))
+    
+        #STORE MAX ID FOR USE IN NEXT GETurl
+        suppressWarnings(max_id <- min(unlist(jsonClean$id)))
+    
+        #UPDATE NUMBER OF TWEETS LEFT TO MEET n PARAMETER
+        nLeft <- nLeft-nToGet
+      }
+    
+      return(unique(tweetText))
+    }
+    
+    
+    ##################################################################################
+    #FUNCTION USING LEXRANKR TO FIND THE MOST REPRESENTATIVE TWEETS OF USERS TIMELINE#
+    ##################################################################################
+    tweetRankr <- function(tweetText, dropMentions=TRUE, dropHashtags=TRUE, n=5, returnTies=TRUE, printTweets = TRUE) {
+    
+      #store original tweet text
+      tweetTextOg <- tweetText
+    
+      #remove instances of @username from text
+      if (dropMentions) tweetText <- gsub("\\@\\w+","",tweetText)
+      #remove instances of #topic from text
+      if (dropHashtags) tweetText <- gsub("\\#\\w+","",tweetText)
+      #clean up any multiple spaces introduced by modifying text
+      tweetText <- trimws(gsub("\\s+", " ", tweetText))
+    
+      #parse text and create doc/sent Ids for each token
+      sentenceTokenList <- lexRankr::sentenceTokenParse(tweetText)
+      #store token data.frame from list output
+        #token data.frame has columns for the token, document Id, and sentence Id
+      tweetTokenDf <- sentenceTokenList$tokens
+    
+      #compute pairwise tweet similarity
+        #using document id from token df instead of sentence id
+        #using docId will find most central tweets as opposed to most central sentences within tweets
+      similDf <- lexRankr::sentenceSimil(tweetTokenDf$docId, tweetTokenDf$token, tweetTokenDf$docId)
+    
+      #apply lexRank algorithm to return top n tweet ids
+      topTweetIdsDf <- lexRankr::lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, n = n, returnTies = returnTies)
+    
+      #subset tweet text variable to extract most central tweets according to lexrank
+      topTweets <- tweetTextOg[as.numeric(topTweetIdsDf$sentenceId)]
+    
+      if (printTweets) cat(paste(paste0(1:length(topTweets), ") ",topTweets), collapse="\n\n"))
+      invisible(topTweets)
+    }
+    
+    consKey     <- 'consumer key'
+    consSecret  <- 'consumer secret'
+    token       <- 'access token'
+    tokenSecret <- 'access token secret'
+    
+    ###################
+    # Hillary Clinton #
+    ###################
+    tweetTextHill <- getUserTweets(user="HillaryClinton", n=5000, consKey, consSecret, token, tokenSecret)
+    topTweetsHill <- tweetRankr(tweetTextHill)
+    ```
+    
+1) "Hillary Clinton must become the next president of the United States. @BernieSanders #DemsInPhilly
+
+2) "In this election, Im with her!" @FLOTUS on Hillary #DemsInPhilly
+
+3) We are better than this.
+
+4) "As your president, I will always have your back." Hillary
+
+5) She knows that love trumps hate. @POTUS on Hillary
+
+    ```
+    ################
+    # Donald Trump #
+    ################
+    tweetTextTrump <- getUserTweets(user="realDonaldTrump", n=5000, consKey, consSecret, token, tokenSecret)
+    topTweetsTrump <- tweetRankr(tweetTextTrump)
+    ```
+1) "@tcloer11: @realDonaldTrump Great job! Make America Great Again!"
+
+2) Wisconsin, we will MAKE AMERICA GREAT AGAIN!
+
+3) MAKE AMERICA GREAT AGAIN!
+
+4) MAKE AMERICA GREAT AGAIN! MAKE AMERICA SAFE AGAIN!
+
+5) AMERICA FIRST!
+
+    ```
+    ##################
+    # Bernie Sanders #
+    ##################
+    tweetTextBern <- getUserTweets(user="SenSanders", n=5000, consKey, consSecret, token, tokenSecret)
+    topTweetsBern <- tweetRankr(tweetTextBern)
+    ```
+1) Unless Congress stands up for the middle class that's getting stepped on by the billionaire class, soon there won't be a middle class left.
+ 
+2) The current federal minimum wage of $7.25 an hour is a starvation wage and must be raised. The minimum wage must become a living wage.
+
+3) We've got to stand up to the fossil fuel industry and fight for legislation that transforms our energy system away from fossil fuels.
+
+4) There is no justice when so few have so much and so many have so little.
+
+5) Health care is a right, not a privilege. Everyone in America should be able to access the health care they need regardless of their income.
+
+    ```
+    #############
+    # Rbloggers #
+    #############
+    tweetTextRblog <- getUserTweets(user="Rbloggers", n=5000, consKey, consSecret, token, tokenSecret)
+    topTweetsRblog <- tweetRankr(tweetTextRblog)
+    ```
+1) New R job: Data Scientist - Machine Learning https://t.co/YiWwXkmxmc #rstats #DataScience #jobs
+
+2) New R job: Principal Analysts x2, Senior Analyst, Analyst (@ Wellington ) http://t.co/5OLIDl51tw #rstats #jobs
+
+3) A Few Days of Python: Using R in Python http://t.co/28j8CAYThn #rstats
+
+4) Network visualization - part 4: 3D networks https://t.co/U6U53xG679 #rstats #DataScience
+
+5) Network visualization - part 4: 3D networks https://t.co/Y625xNNr03 #rstats #DataScience
+
diff --git a/build/vignette.rds b/build/vignette.rds
new file mode 100644
index 0000000..74a2bad
Binary files /dev/null and b/build/vignette.rds differ
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index e54c672..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-r-cran-lexrankr (0.4.0-1) unstable; urgency=medium
-
-  * Initial release (closes: #859507)
-
- -- Andreas Tille <tille at debian.org>  Tue, 04 Apr 2017 16:01:11 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 31a9f8e..0000000
--- a/debian/control
+++ /dev/null
@@ -1,37 +0,0 @@
-Source: r-cran-lexrankr
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: gnu-r
-Priority: optional
-Build-Depends: debhelper (>= 10),
-               dh-r,
-               r-base-dev,
-               r-cran-dplyr,
-               r-cran-tidyr,
-               r-cran-magrittr,
-               r-cran-stringr,
-               r-cran-igraph,
-               r-cran-tm,
-               r-cran-rcpp,
-               r-cran-snowballc
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/R/r-cran-lexrankr/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/R/r-cran-lexrankr/trunk/
-Homepage: https://cran.r-project.org/package=lexRankr
-
-Package: r-cran-lexrankr
-Architecture: any
-Depends: ${R:Depends},
-         ${shlibs:Depends},
-         ${misc:Depends}
-Recommends: ${R:Recommends}
-Suggests: ${R:Suggests}
-Description: extractive summarization of text with the LexRank algorithm
- An R implementation of the LexRank algorithm implementing stochastic
- graph-based method for computing relative importance of textual units
- for Natural Language Processing. We test the technique on the problem
- of Text Summarization (TS). Extractive TS relies on the concept of
- sentence salience to identify the most important sentences in a
- document or set of documents. Salience is typically defined in terms of
- the presence of particular important words or in terms of similarity to
- a centroid pseudo-sentence.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 65d7b18..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,33 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: lexRankr
-Upstream-Contact: Adam Spannbauer <spannbaueradam at gmail.com>
-Source: https://cran.r-project.org/package=lexRankr
-
-Files: *
-Copyright: Adam Spannbauer <spannbaueradam at gmail.com>
-License: MIT
-
-Files: debian/*
-Copyright: 2017 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- .
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 67ce40b..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,2 +0,0 @@
-debian/tests/run-unit-test
-tests
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 529c38a..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/make -f
-
-%:
-	dh $@ --buildsystem R
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/tests/control b/debian/tests/control
deleted file mode 100644
index 199f084..0000000
--- a/debian/tests/control
+++ /dev/null
@@ -1,9 +0,0 @@
-Tests: run-unit-test
-Depends: @, r-cran-testthat,
-Restrictions: allow-stderr
-
-Tests: vignette
-Depends: @
-Restrictions: allow-stderr
-
-
diff --git a/debian/tests/run-unit-test b/debian/tests/run-unit-test
deleted file mode 100644
index c97bea5..0000000
--- a/debian/tests/run-unit-test
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh -e
-
-pkgname=lexRankr
-debname=r-cran-lexrankr
-
-if [ "$ADTTMP" = "" ] ; then
-    ADTTMP=`mktemp -d /tmp/${debname}-test.XXXXXX`
-    trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
-fi
-cd $ADTTMP
-cp -a /usr/share/doc/$debname/tests/* $ADTTMP
-gunzip -r *
-for testfile in *.R; do
-    echo "BEGIN TEST $testfile"
-    LC_ALL=C R --no-save < $testfile
-done
-
diff --git a/debian/tests/vignette b/debian/tests/vignette
deleted file mode 100644
index fdf4c2b..0000000
--- a/debian/tests/vignette
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/sh -e
-
-for vignette in $(find vignettes -iname '*.rnw' -or -iname '*.rmd'); do
-    echo "BEGIN VIGNETTE $vignette"
-    LC_ALL=C R CMD Sweave $vignette
-done
-
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index 5668794..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,10 +0,0 @@
-Reference:
-  Author: Güneş Erkan and Dragomir R. Radev
-  Title: "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization"
-  Journal: Journal of Artific Intelligence Research
-  Year: 2004
-  Volume: 22
-  Pages: 457-479
-  DOI: 10.1613/jair.1523
-  URL: http://jair.org/papers/paper1523.html
-  eprint: http://jair.org/media/1523/live-1523-2354-jair.pdf
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 6a3024f..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=4
-https://cran.r-project.org/src/contrib/lexRankr_([-\d.]*)\.tar\.gz
diff --git a/inst/doc/Analyzing_Twitter_with_LexRankr.html b/inst/doc/Analyzing_Twitter_with_LexRankr.html
new file mode 100644
index 0000000..e4c11f6
--- /dev/null
+++ b/inst/doc/Analyzing_Twitter_with_LexRankr.html
@@ -0,0 +1,268 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+<meta name="author" content="Adam Spannbauer" />
+
+<meta name="date" content="2017-03-01" />
+
+<title>Using lexRankr to find a user’s most representative tweets</title>
+
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+  margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">Using lexRankr to find a user’s most representative tweets</h1>
+<h4 class="author"><em>Adam Spannbauer</em></h4>
+<h4 class="date"><em>2017-03-01</em></h4>
+
+
+
+<div id="packages-used" class="section level3">
+<h3>Packages Used</h3>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(lexRankr)
+<span class="kw">library</span>(tidyverse)
+<span class="kw">library</span>(stringr)
+<span class="kw">library</span>(httr)
+<span class="kw">library</span>(jsonlite)</code></pre></div>
+<p>In this document we get tweets from twitter using the twitter API and then analyze the tweets using lexRankr in order to find a user’s most representative tweets. If you don’t care about interacting with the twitter api you can <strong>jump to the <a href="#lexrank-analysis">lexrank analysis</a></strong>.</p>
+</div>
+<div id="get-user-tweets" class="section level2">
+<h2>Get user tweets</h2>
+<p>Before we can analyze tweets we’ll need some tweets to analyze. We’ll be using <a href="https://dev.twitter.com/overview/api">Twitter’s API</a>, and you’ll need to set up an account to get all keys needed for the api. The credentials needed for the api are: consumer key, consumer secret, token, and token secret. Below is how to set up your credentials to use the twitter api in this vignette.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set api tokens/keys/secrets as environment vars</span>
+<span class="co"># Sys.setenv(cons_key     = 'my_cons_key')</span>
+<span class="co"># Sys.setenv(cons_secret  = 'my_cons_sec')</span>
+<span class="co"># Sys.setenv(token        = 'my_token')</span>
+<span class="co"># Sys.setenv(token_secret = 'my_token_sec')</span>
+
+<span class="co">#sign oauth</span>
+auth <-<span class="st"> </span>httr::<span class="kw">oauth_app</span>(<span class="st">"twitter"</span>, <span class="dt">key=</span><span class="kw">Sys.getenv</span>(<span class="st">"cons_key"</span>), <span class="dt">secret=</span><span class="kw">Sys.getenv</span>(<span class="st">"cons_secret"</span>))
+sig  <-<span class="st"> </span>httr::<span class="kw">sign_oauth1.0</span>(auth, <span class="dt">token=</span><span class="kw">Sys.getenv</span>(<span class="st">"token"</span>), <span class="dt">token_secret=</span><span class="kw">Sys.getenv</span>(<span class="st">"token_secret"</span>))</code></pre></div>
+<p>Now that we have our credentials set up, let’s write a function to get a user’s tweets from the api. Below the function <code>get_timeline_df</code> is defined. The function takes a user’s twitter handle, the number of tweets to get from the api, and the credentials we just set up. The function will return a dataframe with the columns <code>created_at, favorite_count, retweet_count, text</code>. The twitter api limits 200 tweets per get, so we will use a loop until we get the desired  [...]
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">get_timeline_df <-<span class="st"> </span>function(user, <span class="dt">n_tweets=</span><span class="dv">200</span>, oauth_sig) {
+  i <-<span class="st"> </span><span class="dv">0</span>
+  n_left <-<span class="st"> </span>n_tweets
+  timeline_df <-<span class="st"> </span><span class="ot">NULL</span>
+  <span class="co">#loop until n_tweets are all got</span>
+  while (n_left ><span class="st"> </span><span class="dv">0</span>) {
+    n_to_get <-<span class="st"> </span><span class="kw">min</span>(<span class="dv">200</span>, n_left)
+    i <-<span class="st"> </span>i<span class="dv">+1</span>
+    <span class="co">#incorporate max id in get_url (so as not to download the same 200 tweets repeatedly)</span>
+    if (i==<span class="dv">1</span>) {
+      get_url <-<span class="st"> </span><span class="kw">paste0</span>(<span class="st">"https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="</span>,
+                       user,<span class="st">"&count="</span>, n_to_get)
+    } else {
+      get_url <-<span class="st"> </span><span class="kw">paste0</span>(<span class="st">"https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="</span>,
+                       user,<span class="st">"&count="</span>,n_to_get,<span class="st">"&max_id="</span>, max_id)
+    }
+    <span class="co">#GET tweets</span>
+    response <-<span class="st"> </span>httr::<span class="kw">GET</span>(get_url, oauth_sig)
+    <span class="co">#extract content and clean up</span>
+    response_content <-<span class="st"> </span>httr::<span class="kw">content</span>(response)
+    json_content     <-<span class="st"> </span>jsonlite::<span class="kw">toJSON</span>(response_content)
+    <span class="co">#clean out evil special chars</span>
+    json_conv <-<span class="st"> </span><span class="kw">iconv</span>(json_content, <span class="st">"UTF-8"</span>, <span class="st">"ASCII"</span>, <span class="dt">sub =</span> <span class="st">""</span>) %>%
+<span class="st">      </span>stringr::<span class="kw">str_replace_all</span>(<span class="st">"</span><span class="ch">\003</span><span class="st">"</span>, <span class="st">""</span>) <span class="co">#special character (^C) not caught by above clean</span>
+    timeline_list <-<span class="st"> </span>jsonlite::<span class="kw">fromJSON</span>(json_conv)
+    <span class="co">#extract desired fields</span>
+    fields_i_care_about <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"id"</span>, <span class="st">"text"</span>, <span class="st">"favorite_count"</span>, <span class="st">"retweet_count"</span>, <span class="st">"created_at"</span>)
+    timeline_df <-<span class="st"> </span>purrr::<span class="kw">map</span>(fields_i_care_about, ~<span class="kw">unlist</span>(timeline_list[[.x]])) %>%<span class="st"> </span>
+<span class="st">      </span>purrr::<span class="kw">set_names</span>(fields_i_care_about) %>%<span class="st"> </span>
+<span class="st">      </span>dplyr::<span class="kw">as_data_frame</span>() %>%<span class="st"> </span>
+<span class="st">      </span>dplyr::<span class="kw">bind_rows</span>(timeline_df) %>%<span class="st"> </span>
+<span class="st">      </span>dplyr::<span class="kw">distinct</span>()
+    <span class="co">#store min id (oldest tweet) to set as max id for next GET</span>
+    max_id <-<span class="st"> </span><span class="kw">min</span>(purrr::<span class="kw">map_dbl</span>(timeline_list$id, <span class="dv">1</span>))
+    <span class="co">#update number of tweets left</span>
+    n_left <-<span class="st"> </span>n_left-n_to_get
+  }
+  <span class="kw">return</span>(timeline_df)
+}</code></pre></div>
+<p>We can now use our function to gather a user’s tweets along with the date-time, favorite count, and retweet count of each. Let’s use one of the most famous twitter accounts as of late: <span class="citation">[@realDonaldTrump]</span>(<a href="https://twitter.com/realDonaldTrump" class="uri">https://twitter.com/realDonaldTrump</a>).</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">tweets_df <-<span class="st"> </span><span class="kw">get_timeline_df</span>(<span class="st">"realDonaldTrump"</span>, <span class="dv">600</span>, sig) %>%<span class="st"> </span>
+<span class="st">    </span><span class="kw">mutate</span>(<span class="dt">text =</span> <span class="kw">str_replace_all</span>(text, <span class="st">"</span><span class="ch">\n</span><span class="st">"</span>, <span class="st">" "</span>)) <span class="co">#clean out newlines for display</span>
+
+tweets_df %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">head</span>(<span class="dt">n=</span><span class="dv">3</span>) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(text, created_at) %>%<span class="st"> </span>
+<span class="st">  </span>knitr::<span class="kw">kable</span>()</code></pre></div>
+<table>
+<thead>
+<tr class="header">
+<th align="left">text</th>
+<th align="left">created_at</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">Yes, it is true - Carlos Slim, the great businessman from Mexico, called me about getting together for a meeting. We met, HE IS A GREAT GUY!</td>
+<td align="left">Tue Dec 20 20:27:57 +0000 2016</td>
+</tr>
+<tr class="even">
+<td align="left">especially how to get people, even with an unlimited budget, out to vote in the vital swing states ( and more). They focused on wrong states</td>
+<td align="left">Tue Dec 20 13:09:18 +0000 2016</td>
+</tr>
+<tr class="odd">
+<td align="left">Bill Clinton stated that I called him after the election. Wrong, he called me (with a very nice congratulations). He “doesn’t know much” …</td>
+<td align="left">Tue Dec 20 13:03:59 +0000 2016</td>
+</tr>
+</tbody>
+</table>
+</div>
+<div id="lexrank-analysis" class="section level2">
+<h2>Lexrank Analysis</h2>
+<p>We now have a dataframe that contains a column of tweets. This column of tweets will be the subject of the rest of the analysis. With the data in this format, we only need to call the <code>bind_lexrank</code> function to apply the lexrank algorithm to the tweets. The function will add a column of lexrank scores. The higher the lexrank score, the more representative the tweet is of the set of tweets we downloaded.</p>
+<p><em>note: typically one would parse documents into sentences before applying lexrank (</em><code>?unnest_sentences</code><em>); however we will equate tweets to sentences for this analysis</em></p>
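+<p><em>For reference, a minimal sketch of that more typical sentence-level workflow (a hypothetical <code>doc_df</code> with <code>doc_id</code> and <code>text</code> columns is assumed):</em></p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># doc_df: one row per document, with columns doc_id and text
+doc_df %>%
+  unnest_sentences(sents, text) %>%                 # split each document into sentences
+  bind_lexrank(sents, doc_id, level = "sentences")  # attach a lexrank score per sentence</code></pre></div>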
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">tweets_df %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">bind_lexrank</span>(text, id, <span class="dt">level=</span><span class="st">"sentences"</span>) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(lexrank)) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">head</span>(<span class="dt">n=</span><span class="dv">5</span>) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(text, lexrank) %>%<span class="st"> </span>
+<span class="st">  </span>knitr::<span class="kw">kable</span>(<span class="dt">caption =</span> <span class="st">"Most Representative @realDonaldTrump Tweets"</span>)</code></pre></div>
+<table>
+<caption>Most Representative <span class="citation">@realDonaldTrump</span> Tweets</caption>
+<thead>
+<tr class="header">
+<th align="left">text</th>
+<th align="right">lexrank</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">MAKE AMERICA GREAT AGAIN!</td>
+<td align="right">0.0087551</td>
+</tr>
+<tr class="even">
+<td align="left">Well, the New Year begins. We will, together, MAKE AMERICA GREAT AGAIN!</td>
+<td align="right">0.0085258</td>
+</tr>
+<tr class="odd">
+<td align="left">HAPPY PRESIDENTS DAY - MAKE AMERICA GREAT AGAIN!</td>
+<td align="right">0.0082361</td>
+</tr>
+<tr class="even">
+<td align="left">Happy Thanksgiving to everyone. We will, together, MAKE AMERICA GREAT AGAIN!</td>
+<td align="right">0.0060486</td>
+</tr>
+<tr class="odd">
+<td align="left">Hopefully, all supporters, and those who want to MAKE AMERICA GREAT AGAIN, will go to D.C. on January 20th. It will be a GREAT SHOW!</td>
+<td align="right">0.0059713</td>
+</tr>
+</tbody>
+</table>
+</div>
+<div id="repeating-tweetrank-analysis-for-other-users" class="section level2">
+<h2>Repeating tweetRank analysis for other users</h2>
+<p>With our <code>get_timeline_df</code> function we can easily repeat this analysis for other users. Below we repeat the whole analysis in a single magrittr pipeline.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">get_timeline_df</span>(<span class="st">"dog_rates"</span>, <span class="dv">600</span>, sig) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">mutate</span>(<span class="dt">text =</span> <span class="kw">str_replace_all</span>(text, <span class="st">"</span><span class="ch">\n</span><span class="st">"</span>, <span class="st">" "</span>)) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">bind_lexrank</span>(text, id, <span class="dt">level=</span><span class="st">"sentences"</span>) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(lexrank)) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">head</span>(<span class="dt">n=</span><span class="dv">5</span>) %>%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(text, lexrank) %>%<span class="st"> </span>
+<span class="st">  </span>knitr::<span class="kw">kable</span>(<span class="dt">caption =</span> <span class="st">"Most Representative @dog_rates Tweets"</span>)</code></pre></div>
+<table>
+<caption>Most Representative <span class="citation">@dog_rates</span> Tweets</caption>
+<thead>
+<tr class="header">
+<th align="left">text</th>
+<th align="right">lexrank</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left"><span class="citation">@Lin_Manuel</span> good day good dog</td>
+<td align="right">0.0167123</td>
+</tr>
+<tr class="even">
+<td align="left">Please keep loving</td>
+<td align="right">0.0099864</td>
+</tr>
+<tr class="odd">
+<td align="left">Here we h*ckin go</td>
+<td align="right">0.0085708</td>
+</tr>
+<tr class="even">
+<td align="left">Last day to get anything from our Valentine’s Collection by Valentine’s Day! Shop: <a href="https://t.co/MXljGLH3qY" class="uri">https://t.co/MXljGLH3qY</a> <a href="https://t.co/qFBCMytKMB" class="uri">https://t.co/qFBCMytKMB</a></td>
+<td align="right">0.0077583</td>
+</tr>
+<tr class="odd">
+<td align="left">Even if I tried (which I would never), I’d last like 17 seconds</td>
+<td align="right">0.0073899</td>
+</tr>
+</tbody>
+</table>
+<p></br></br></p>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+  (function () {
+    var script = document.createElement("script");
+    script.type = "text/javascript";
+    script.src  = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+    document.getElementsByTagName("head")[0].appendChild(script);
+  })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/Analyzing_Twitter_with_LexRankr.html.asis b/inst/doc/Analyzing_Twitter_with_LexRankr.html.asis
new file mode 100644
index 0000000..698eea1
--- /dev/null
+++ b/inst/doc/Analyzing_Twitter_with_LexRankr.html.asis
@@ -0,0 +1,4 @@
+%\VignetteIndexEntry{Analyzing Twitter with LexRankr}
+%\VignetteEngine{R.rsp::asis}
+%\VignetteKeyword{twitter}
+%\VignetteKeyword{lexrankr}
diff --git a/man/bind_lexrank_.Rd b/man/bind_lexrank_.Rd
new file mode 100644
index 0000000..16477c3
--- /dev/null
+++ b/man/bind_lexrank_.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bind_lexrank.R
+\name{bind_lexrank_}
+\alias{bind_lexrank}
+\alias{bind_lexrank_}
+\title{Bind lexrank scores to a dataframe of text}
+\usage{
+bind_lexrank_(tbl, text, doc_id, sent_id = NULL, level = c("sentences",
+  "tokens"), threshold = 0.2, usePageRank = TRUE, damping = 0.85,
+  continuous = FALSE, ...)
+
+bind_lexrank(tbl, text, doc_id, sent_id = NULL, level = c("sentences",
+  "tokens"), threshold = 0.2, usePageRank = TRUE, damping = 0.85,
+  continuous = FALSE, ...)
+}
+\arguments{
+\item{tbl}{dataframe containing column of sentences to be lexranked}
+
+\item{text}{name of column containing sentences or tokens to be lexranked}
+
+\item{doc_id}{name of column containing document ids corresponding to \code{text}}
+
+\item{sent_id}{Only needed if \code{level} is "tokens". name of column containing sentence ids corresponding to \code{text}}
+
+\item{level}{the parsed level of the text column to be lexranked.  i.e. is \code{text} a column of "sentences" or "tokens"?  The "tokens" level is provided to allow users to implement custom tokenization.  Note: even if the input \code{level} is "tokens" lexrank scores are assigned at the sentence level.}
+
+\item{threshold}{The minimum similarity value a sentence pair must have to be represented in the graph where lexRank is calculated.}
+
+\item{usePageRank}{\code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.}
+
+\item{damping}{The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.}
+
+\item{continuous}{\code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.}
+
+\item{...}{tokenizing options to be passed to lexRankr::tokenize.  Ignored if \code{level} is "sentences"}
+}
+\value{
+A dataframe with an additional column of lexrank scores (column is given name lexrank)
+}
+\description{
+Bind lexrank scores to a dataframe of sentences or to a dataframe of tokens with sentence ids
+}
+\examples{
+library(dplyr)
+
+df <- dplyr::tibble(doc_id = 1:3, 
+                    text = c("Testing the system. Second sentence for you.", 
+                             "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked."))
+
+df \%>\% 
+  unnest_sentences(sents, text) \%>\% 
+  bind_lexrank(sents, doc_id, level = 'sentences')
+
+df \%>\% 
+  unnest_sentences(sents, text) \%>\% 
+  bind_lexrank_("sents", "doc_id", level = 'sentences')
+
+df <- data.frame(doc_id  = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+                             2, 2, 2, 3, 3, 3, 3, 3, 3), 
+                 sent_id = c(1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 
+                             1, 1, 1, 1, 1, 1, 1, 1, 1), 
+                 tokens = c("testing", "the", "system", "second", 
+                            "sentence", "for", "you", "system", 
+                            "testing", "the", "tidy", "documents", 
+                            "df", "documents", "will", "be", "parsed", 
+                            "and", "lexranked"),
+                 stringsAsFactors = FALSE)
+
+df \%>\% 
+  bind_lexrank(tokens, doc_id, sent_id, level = 'tokens')
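+
+# illustrative addition: continuous LexRank uses a weighted graph and ignores the threshold
+df \%>\% 
+  bind_lexrank(tokens, doc_id, sent_id, level = 'tokens', continuous = TRUE)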
+}
+
diff --git a/man/lexRank.Rd b/man/lexRank.Rd
new file mode 100644
index 0000000..e4b06e5
--- /dev/null
+++ b/man/lexRank.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lexRank.R
+\name{lexRank}
+\alias{lexRank}
+\title{Extractive text summarization with LexRank}
+\usage{
+lexRank(text, docId = "create", threshold = 0.2, n = 3,
+  returnTies = TRUE, usePageRank = TRUE, damping = 0.85,
+  continuous = FALSE, sentencesAsDocs = FALSE, removePunc = TRUE,
+  removeNum = TRUE, toLower = TRUE, stemWords = TRUE,
+  rmStopWords = TRUE, Verbose = TRUE)
+}
+\arguments{
+\item{text}{A character vector of documents to be cleaned and processed by the LexRank algorithm}
+
+\item{docId}{A vector of document IDs with length equal to the length of \code{text}.  If \code{docId == "create"} then doc IDs will be created as an index from 1 to \code{n}, where \code{n} is the length of \code{text}.}
+
+\item{threshold}{The minimum simil value a sentence pair must have to be represented in the graph where lexRank is calculated.}
+
+\item{n}{The number of sentences to return as the extractive summary.  The function will return the top \code{n} lexRanked sentences.  See \code{returnTies} for handling ties in lexRank.}
+
+\item{returnTies}{\code{TRUE} or \code{FALSE} indicating whether or not to return more than \code{n} sentence IDs if there is a tie in lexRank.  If \code{TRUE}, the returned number of sentences will not be limited to \code{n}; every sentence with a top \code{n} score will be returned.  If \code{FALSE}, the returned number of sentences will be \code{<=n}. Defaults to \code{TRUE}.}
+
+\item{usePageRank}{\code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.}
+
+\item{damping}{The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.}
+
+\item{continuous}{\code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.}
+
+\item{sentencesAsDocs}{\code{TRUE} or \code{FALSE}, indicating whether or not to treat sentences as documents when calculating tfidf scores for similarity. If \code{TRUE}, inverse document frequency will be calculated as inverse sentence frequency (useful for single document extractive summarization).}
+
+\item{removePunc}{\code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from text while tokenizing.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.}
+
+\item{removeNum}{\code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from text while tokenizing.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.}
+
+\item{toLower}{\code{TRUE} or \code{FALSE} indicating whether or not to coerce all of text to lowercase while tokenizing.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.}
+
+\item{stemWords}{\code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.}
+
+\item{rmStopWords}{\code{TRUE}, \code{FALSE}, or character vector of stopwords to remove from tokens. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.}
+
+\item{Verbose}{\code{TRUE} or \code{FALSE} indicating whether or not to \code{cat} progress messages to the console while running.  Defaults to \code{TRUE}.}
+}
+\value{
+A 2 column dataframe with columns \code{sentenceId} and \code{value}. \code{sentenceId} contains the ids of the top \code{n} sentences in descending order by \code{value}. \code{value} contains page rank score (if \code{usePageRank==TRUE}) or degree centrality (if \code{usePageRank==FALSE}).
+}
+\description{
+Compute LexRanks from a vector of documents using the page rank algorithm or degree centrality.  The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
+}
+\examples{
+lexRank(c("This is a test.","Tests are fun.",
+"Do you think the exam will be hard?","Is an exam the same as a test?",
+"How many questions are going to be on the exam?"))
+}
+\references{
+\url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+}
+
diff --git a/man/lexRankFromSimil.Rd b/man/lexRankFromSimil.Rd
new file mode 100644
index 0000000..1fc5ced
--- /dev/null
+++ b/man/lexRankFromSimil.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lexRankFromSimil.R
+\name{lexRankFromSimil}
+\alias{lexRankFromSimil}
+\title{Compute LexRanks from pairwise sentence similarities}
+\usage{
+lexRankFromSimil(s1, s2, simil, threshold = 0.2, n = 3, returnTies = TRUE,
+  usePageRank = TRUE, damping = 0.85, continuous = FALSE)
+}
+\arguments{
+\item{s1}{A character vector of sentence IDs corresponding to the \code{s2} and \code{simil} arguments.}
+
+\item{s2}{A character vector of sentence IDs corresponding to the \code{s1} and \code{simil} arguments.}
+
+\item{simil}{A numeric vector of similarity values between the sentences identified by the IDs in \code{s1} and \code{s2}.}
+
+\item{threshold}{The minimum simil value a sentence pair must have to be represented in the graph where lexRank is calculated.}
+
+\item{n}{The number of sentences to return as the extractive summary.  The function will return the top \code{n} lexRanked sentences.  See \code{returnTies} for handling ties in lexRank.}
+
+\item{returnTies}{\code{TRUE} or \code{FALSE} indicating whether or not to return more than \code{n} sentence IDs if there is a tie in lexRank.  If \code{TRUE}, the returned number of sentences will not be limited to \code{n}; every sentence with a top \code{n} score will be returned.  If \code{FALSE}, the returned number of sentences will be \code{<=n}. Defaults to \code{TRUE}.}
+
+\item{usePageRank}{\code{TRUE} or \code{FALSE} indicating whether or not to use the page rank algorithm for ranking sentences.  If \code{FALSE}, a sentence's unweighted centrality will be used as the rank.  Defaults to \code{TRUE}.}
+
+\item{damping}{The damping factor to be passed to page rank algorithm.  Ignored if \code{usePageRank} is \code{FALSE}.}
+
+\item{continuous}{\code{TRUE} or \code{FALSE} indicating whether or not to use continuous LexRank.  Only applies if \code{usePageRank==TRUE}.  If \code{TRUE}, \code{threshold} will be ignored and lexRank will be computed using a weighted graph representation of the sentences. Defaults to \code{FALSE}.}
+}
+\value{
+A 2 column dataframe with columns \code{sentenceId} and \code{value}. \code{sentenceId} contains the ids of the top \code{n} sentences in descending order by \code{value}. \code{value} contains page rank score (if \code{usePageRank==TRUE}) or degree centrality (if \code{usePageRank==FALSE}).
+}
+\description{
+Compute LexRanks from sentence pair similarities using the page rank algorithm or degree centrality.  The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
+}
+\examples{
+lexRankFromSimil(s1=c("d1_1","d1_1","d1_2"), s2=c("d1_2","d2_1","d2_1"), simil=c(.01,.03,.5))
+}
+\references{
+\url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+}
+
diff --git a/man/sentenceParse.Rd b/man/sentenceParse.Rd
new file mode 100644
index 0000000..f276c68
--- /dev/null
+++ b/man/sentenceParse.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentenceParse.R
+\name{sentenceParse}
+\alias{sentenceParse}
+\title{Parse text into sentences}
+\usage{
+sentenceParse(text, docId = "create")
+}
+\arguments{
+\item{text}{Character vector to be parsed into sentences}
+
+\item{docId}{A vector of document IDs with length equal to the length of \code{text}.  If \code{docId == "create"} then doc IDs will be created as an index from 1 to \code{n}, where \code{n} is the length of \code{text}.}
+}
+\value{
+A data frame with 3 columns and \code{n} rows, where \code{n} is the number of sentences found by the routine.  Column 1: \code{docId} document id for the sentence. Column 2: \code{sentenceId} sentence id for the sentence.  Column 3: \code{sentence} the sentences found in the routine.
+}
+\description{
+Parse the elements of a character vector into a dataframe of sentences with additional identifiers.
+}
+\examples{
+sentenceParse("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA.")
+sentenceParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."),
+               docId=c("d1","d2"))
+}
+
diff --git a/man/sentenceSimil.Rd b/man/sentenceSimil.Rd
new file mode 100644
index 0000000..a10e77c
--- /dev/null
+++ b/man/sentenceSimil.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentenceSimil.R
+\name{sentenceSimil}
+\alias{sentenceSimil}
+\title{Compute distance between sentences}
+\usage{
+sentenceSimil(sentenceId, token, docId = NULL, sentencesAsDocs = FALSE)
+}
+\arguments{
+\item{sentenceId}{A character vector of sentence IDs corresponding to the \code{docId} and \code{token} arguments.}
+
+\item{token}{A character vector of tokens corresponding to the \code{docId} and \code{sentenceId} arguments.}
+
+\item{docId}{A character vector of document IDs corresponding to the \code{sentenceId} and \code{token} arguments.  Can be \code{NULL} if \code{sentencesAsDocs} is \code{TRUE}.}
+
+\item{sentencesAsDocs}{\code{TRUE} or \code{FALSE}, indicating whether or not to treat sentences as documents when calculating tfidf scores. If \code{TRUE}, inverse document frequency will be calculated as inverse sentence frequency (useful for single document extractive summarization)}
+}
+\value{
+A 3 column dataframe of pairwise distances between sentences. Columns: \code{sent1} (sentence id), \code{sent2} (sentence id), & \code{dist} (distance between \code{sent1} and \code{sent2}).
+}
+\description{
+Compute distance between sentences using modified idf cosine distance from "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization".  Output can be used as input to \code{\link{lexRankFromSimil}}.
+}
+\examples{
+sentenceSimil(docId=c("d1","d1","d2","d2"),
+               sentenceId=c("d1_1","d1_1","d2_1","d2_1"),
+               token=c("i", "ran", "jane", "ran"))
+}
+\references{
+\url{http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html}
+}
+
diff --git a/man/sentenceTokenParse.Rd b/man/sentenceTokenParse.Rd
new file mode 100644
index 0000000..4f091b8
--- /dev/null
+++ b/man/sentenceTokenParse.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentenceTokenParse.R
+\name{sentenceTokenParse}
+\alias{sentenceTokenParse}
+\title{Parse text into sentences and tokens}
+\usage{
+sentenceTokenParse(text, docId = "create", removePunc = TRUE,
+  removeNum = TRUE, toLower = TRUE, stemWords = TRUE,
+  rmStopWords = TRUE)
+}
+\arguments{
+\item{text}{A character vector of documents to be parsed into sentences and tokenized.}
+
+\item{docId}{A character vector of document Ids the same length as \code{text}.  If \code{docId=="create"} document Ids will be created.}
+
+\item{removePunc}{\code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from \code{text} while tokenizing.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.}
+
+\item{removeNum}{\code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from \code{text} while tokenizing.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.}
+
+\item{toLower}{\code{TRUE} or \code{FALSE} indicating whether or not to coerce all of \code{text} to lowercase while tokenizing.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.}
+
+\item{stemWords}{\code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.}
+
+\item{rmStopWords}{\code{TRUE}, \code{FALSE}, or character vector of stopwords to remove from tokens. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.}
+}
+\value{
+A list of dataframes.  The first element of the list returned is the \code{sentences} dataframe; this dataframe has columns \code{docId}, \code{sentenceId}, & \code{sentence} (the actual text of the sentence).  The second element of the list returned is the \code{tokens} dataframe; this dataframe has columns \code{docId}, \code{sentenceId}, & \code{token} (the actual text of the token).
+}
+\description{
+Parse a character vector of documents into both sentences and a clean vector of tokens.  The resulting output includes IDs for document and sentence for use in other \code{lexRank} functions.
+}
+\examples{
+sentenceTokenParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."),
+                   docId=c("d1","d2"))
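+
+# illustrative addition (assumes the returned list elements are named):
+parsed <- sentenceTokenParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."),
+                             docId=c("d1","d2"))
+parsed$sentences
+parsed$tokens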
+}
+
diff --git a/man/sentence_parser.Rd b/man/sentence_parser.Rd
new file mode 100644
index 0000000..2bba98f
--- /dev/null
+++ b/man/sentence_parser.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentence_parser.R
+\name{sentence_parser}
+\alias{sentence_parser}
+\title{Utility to parse sentences from text}
+\usage{
+sentence_parser(text)
+}
+\arguments{
+\item{text}{Character vector to be parsed into sentences}
+}
+\value{
+A list with length equal to `length(text)`; list elements are character vectors of text parsed with sentence regex
+}
+\description{
+Utility to parse sentences from text; created to have a central shared sentence parsing function
+}
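+% Editorial sketch, not part of the upstream file: sentence_parser is an internal
+% helper, so it is reached below via the ::: operator.
+\examples{
+\dontrun{
+lexRankr:::sentence_parser(c("Testing the system. Second sentence for you.",
+                             "Documents will be parsed."))
+}
+}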
+
diff --git a/man/tokenize.Rd b/man/tokenize.Rd
new file mode 100644
index 0000000..5ece152
--- /dev/null
+++ b/man/tokenize.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tokenize.R
+\name{tokenize}
+\alias{tokenize}
+\title{Tokenize a character vector}
+\usage{
+tokenize(text, removePunc = TRUE, removeNum = TRUE, toLower = TRUE,
+  stemWords = TRUE, rmStopWords = TRUE)
+}
+\arguments{
+\item{text}{The character vector to be tokenized}
+
+\item{removePunc}{\code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from \code{text}.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.}
+
+\item{removeNum}{\code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from \code{text}.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.}
+
+\item{toLower}{\code{TRUE} or \code{FALSE} indicating whether or not to coerce all of \code{text} to lowercase.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.}
+
+\item{stemWords}{\code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the output tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.}
+
+\item{rmStopWords}{\code{TRUE}, \code{FALSE}, or character vector of stopwords to remove. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.}
+}
+\description{
+Tokenize a character vector
+Parse the elements of a character vector into a list of cleaned tokens.
+}
+\examples{
+tokenize("Mr. Feeny said the test would be on Sat. At least I'm 99.9\% sure that's what he said.")
+tokenize("Bill is trying to earn a Ph.D. in his field.", rmStopWords=FALSE)
+}
+
diff --git a/man/unnest_sentences_.Rd b/man/unnest_sentences_.Rd
new file mode 100644
index 0000000..b04180f
--- /dev/null
+++ b/man/unnest_sentences_.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unnest_sentences.R
+\name{unnest_sentences_}
+\alias{unnest_sentences}
+\alias{unnest_sentences_}
+\title{Split a column of text into sentences}
+\usage{
+unnest_sentences_(tbl, output, input, output_id = "sent_id", drop = TRUE)
+
+unnest_sentences(tbl, output, input, output_id = "sent_id", drop = TRUE)
+}
+\arguments{
+\item{tbl}{dataframe containing column of text to be split into sentences}
+
+\item{output}{name of column to be created to store parsed sentences}
+
+\item{input}{name of input column of text to be parsed into sentences}
+
+\item{output_id}{name of column to be created to store sentence ids}
+
+\item{drop}{whether original input column should get dropped}
+}
+\value{
+A data.frame of parsed sentences and sentence ids
+}
+\description{
+Split a column of text into sentences
+}
+\examples{
+library(dplyr)
+
+df <- dplyr::tibble(doc_id = 1:3, 
+                    text = c("Testing the system. Second sentence for you.", 
+                             "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked."))
+
+unnest_sentences(df, sents, text)
+unnest_sentences_(df, "sents", "text")
+
+df \%>\% 
+  unnest_sentences(sents, text)
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
new file mode 100644
index 0000000..3d3c46f
--- /dev/null
+++ b/src/RcppExports.cpp
@@ -0,0 +1,18 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// idfCosineSimil
+NumericVector idfCosineSimil(NumericMatrix mat);
+RcppExport SEXP lexRankr_idfCosineSimil(SEXP matSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type mat(matSEXP);
+    rcpp_result_gen = Rcpp::wrap(idfCosineSimil(mat));
+    return rcpp_result_gen;
+END_RCPP
+}
diff --git a/src/idfCosineSimil.cpp b/src/idfCosineSimil.cpp
new file mode 100644
index 0000000..cd9da3c
--- /dev/null
+++ b/src/idfCosineSimil.cpp
@@ -0,0 +1,39 @@
+#include <Rcpp.h>
+using namespace Rcpp;
+
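+// Cosine similarity between two numeric vectors (rows of the idf-weighted term matrix).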
+double idfCosineSimilVector(NumericVector x, NumericVector y) {
+  int n=x.size();
+  double numerator=0;
+  double denomenatorX=0;
+  double denomenatorY=0;
+  double result;
+
+  for (int i = 0; i<n; ++i) {
+    numerator += x[i]*y[i];
+    denomenatorX += x[i]*x[i];
+    denomenatorY += y[i]*y[i];
+  }
+
+  result = numerator/(sqrt(denomenatorX)*sqrt(denomenatorY));
+
+  return result;
+}
+
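+// Pairwise cosine similarity for every row pair (i < j) of mat, returned as a
+// vector of length n*(n-1)/2 ordered by the upper triangle, row by row.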
+// [[Rcpp::export]]
+NumericVector idfCosineSimil(NumericMatrix mat) {
+  int n = mat.nrow();
+  int nChoose2 = n*(n-1)/2;
+  NumericVector result(nChoose2);
+  int resInd = 0;
+
+  for (int i = 0; i<n; ++i) {
+    for (int j = i+1; j<n; ++j){
+      if(resInd % 10000 == 0) Rcpp::checkUserInterrupt();
+      result[resInd] = idfCosineSimilVector(mat(i,_), mat(j,_));
+      ++resInd;
+    }
+  }
+
+  return result;
+
+}
diff --git a/src/register_routines.c b/src/register_routines.c
new file mode 100644
index 0000000..67f3561
--- /dev/null
+++ b/src/register_routines.c
@@ -0,0 +1,22 @@
+#include <R.h>
+#include <Rinternals.h>
+#include <stdlib.h> // for NULL
+#include <R_ext/Rdynload.h>
+
+/* FIXME: 
+  Check these declarations against the C/Fortran source code.
+*/
+  
+  /* .Call calls */
+  extern SEXP lexRankr_idfCosineSimil(SEXP);
+
+static const R_CallMethodDef CallEntries[] = {
+  {"lexRankr_idfCosineSimil", (DL_FUNC) &lexRankr_idfCosineSimil, 1},
+  {NULL, NULL, 0}
+};
+
+void R_init_lexRankr(DllInfo *dll)
+{
+  R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+  R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..f4594d8
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(lexRankr)
+
+test_check("lexRankr")
diff --git a/tests/testthat/test-bind_lexrank.R b/tests/testthat/test-bind_lexrank.R
new file mode 100644
index 0000000..83a7538
--- /dev/null
+++ b/tests/testthat/test-bind_lexrank.R
@@ -0,0 +1,124 @@
+context("bind_lexrank")
+
+# test output str --------------------------------------------------------
+test_that("correct ouput class and str", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- df %>% 
+    unnest_sentences(sents, text) %>% 
+    bind_lexrank(sents, doc_id, level = 'sentences')
+  
+  expect_equal(dim(test_result), c(4,4))
+  expect_true(is.data.frame(test_result))
+  expect_equal(names(test_result), c("doc_id","sent_id","sents","lexrank"))
+  
+  test_result <- df %>% 
+    unnest_sentences(sents, text, drop=FALSE) %>% 
+    bind_lexrank(sents,doc_id, level = 'sentences')
+  
+  expect_equal(dim(test_result), c(4,5))
+  expect_equal(names(test_result), c("doc_id","text","sent_id","sents","lexrank"))
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- df %>% 
+    bind_lexrank(tokens, doc_id, sent_id, "tokens")
+  
+  expect_equal(dim(test_result), c(19,5))
+  expect_equal(names(test_result), c("doc_id","sent_id","sents","tokens","lexrank"))
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE) %>% 
+    unnest_sentences(sents, text)
+  
+  expect_error(bind_lexrank(df, sents, fake))
+  expect_error(bind_lexrank(NULL, sents, doc_id))
+  expect_error(bind_lexrank(df, sents, doc_id, level="fake"))
+  expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  expect_error(bind_lexrank(df, tokens, doc_id, fake, level="tokens"))
+  expect_error(bind_lexrank(df, tokens, doc_id, level="tokens"))
+  expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
+})
+
+# test output val ------------------------------------------------------
+test_that("output value", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE) %>% 
+    unnest_sentences(sents, text)
+  
+  test_result     <- bind_lexrank(df, sents, doc_id, level="sentences")
+  expected_result <- data.frame(doc_id = c(1L, 1L, 2L, 3L), 
+                                sent_id = c(1L, 2L, 1L, 1L), 
+                                sents = c("Testing the system.", "Second sentence for you.", 
+                                          "System testing the tidy documents df.", "Documents will be parsed and lexranked."), 
+                                lexrank = c(0.5, NA, 0.5, NA),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- bind_lexrank(df, tokens, doc_id, sent_id, level="sentences") 
+  test_result$lexrank <- round(test_result$lexrank, 5)
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                                sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                                sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                                          "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                                          "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                                          "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                                          "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                                          "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                                tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                                           "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+})
+
diff --git a/tests/testthat/test-bind_lexrank_.R b/tests/testthat/test-bind_lexrank_.R
new file mode 100644
index 0000000..7bbf217
--- /dev/null
+++ b/tests/testthat/test-bind_lexrank_.R
@@ -0,0 +1,124 @@
+context("bind_lexrank_")
+
+# test output str --------------------------------------------------------
+test_that("correct ouput class and str", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- df %>% 
+    unnest_sentences(sents, text) %>% 
+    bind_lexrank_("sents", "doc_id", level = 'sentences')
+  
+  expect_equal(dim(test_result), c(4,4))
+  expect_true(is.data.frame(test_result))
+  expect_equal(names(test_result), c("doc_id","sent_id","sents","lexrank"))
+  
+  test_result <- df %>% 
+    unnest_sentences(sents, text, drop=FALSE) %>% 
+    bind_lexrank_("sents", "doc_id", level = 'sentences')
+  
+  expect_equal(dim(test_result), c(4,5))
+  expect_equal(names(test_result), c("doc_id","text","sent_id","sents","lexrank"))
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- df %>% 
+    bind_lexrank_("tokens", "doc_id", "sent_id", "tokens")
+  
+  expect_equal(dim(test_result), c(19,5))
+  expect_equal(names(test_result), c("doc_id","sent_id","sents","tokens","lexrank"))
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE) %>% 
+    unnest_sentences(sents, text)
+  
+  expect_error(bind_lexrank_(df, "sents", "fake"))
+  expect_error(bind_lexrank_(NULL, "sents", "doc_id"))
+  expect_error(bind_lexrank_(df, "sents", "doc_id", level="fake"))
+  expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  expect_error(bind_lexrank_(df, "tokens", "doc_id", "fake", level="tokens"))
+  expect_error(bind_lexrank_(df, "tokens", "doc_id", level="tokens"))
+  expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
+})
+
+# test output val ------------------------------------------------------
+test_that("output value", {
+  df <- data.frame(doc_id = 1:3, 
+                    text = c("Testing the system. Second sentence for you.", 
+                             "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked."),
+                    stringsAsFactors = FALSE) %>% 
+    unnest_sentences(sents, text)
+  
+  test_result     <- bind_lexrank_(df, "sents", "doc_id", level="sentences")
+  expected_result <- data.frame(doc_id = c(1L, 1L, 2L, 3L), 
+                                sent_id = c(1L, 2L, 1L, 1L), 
+                                sents = c("Testing the system.", "Second sentence for you.", 
+                                          "System testing the tidy documents df.", "Documents will be parsed and lexranked."), 
+                                lexrank = c(0.5, NA, 0.5, NA),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+  
+  df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                   sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                   sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                             "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                             "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                   tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                              "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- bind_lexrank_(df, "tokens", "doc_id", "sent_id", level="sentences")
+  test_result$lexrank <- round(test_result$lexrank, 5)
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
+                                sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
+                                sents = c("Testing the system.", "Testing the system.", "Testing the system.", 
+                                          "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", "Second sentence for you.", 
+                                          "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                                          "System testing the tidy documents df.", "System testing the tidy documents df.", "System testing the tidy documents df.", 
+                                          "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", 
+                                          "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
+                                tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
+                                           "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+})
+
diff --git a/tests/testthat/test-idfCosine.R b/tests/testthat/test-idfCosine.R
new file mode 100644
index 0000000..c5db4f5
--- /dev/null
+++ b/tests/testthat/test-idfCosine.R
@@ -0,0 +1,45 @@
+context("lexRankr:::idfCosineSimil")
+
+# test bad inputs ---------------------------------------
+test_that("bad inputs to idf cosine", {
+  expect_error(lexRankr:::idfCosineSimil(NULL))
+  
+  badMat <- matrix(c("a","b","c","d"), nrow=2)
+  expect_error(lexRankr:::idfCosineSimil(badMat))
+})
+
+# test object out str and class ---------------------------------------
+test_that("object out str and class", {
+  testMat <- matrix(runif(9, min = .01, max = 1), nrow=3)
+  testResult <- lexRankr:::idfCosineSimil(testMat)
+  
+  expect_equal(class(testResult), "numeric")
+  expect_equal(length(testResult), 3)
+})
+
+# test object out value
+test_that("object out value", {
+  testMat <- matrix(c(1,0,0,0,1,0,0,0,1), nrow=3)
+  expect_equal(lexRankr:::idfCosineSimil(testMat), c(0,0,0))
+  
+  testMat <- matrix(c(0,0,0,0,0,0,0,0,0), nrow=3)
+  expect_equal(lexRankr:::idfCosineSimil(testMat), c(NaN,NaN,NaN))
+
+  testMat <- matrix(c(1,1,1,1,1,1,1,1,1), nrow=3)
+  expect_equal(lexRankr:::idfCosineSimil(testMat), c(1,1,1))
+  
+  testMat <- matrix(runif(9, min = .01, max = 1), nrow=3)
+  rcppIdf <- round(lexRankr:::idfCosineSimil(testMat), 10)
+  #pure r version comparison
+  idfCosine <- function(x,y) {
+    res <- sum(x*y)/(sqrt(sum(x^2))*sqrt(sum(y^2)))
+    return(round(res, 10))
+  }
+  
+  elem1 <- idfCosine(testMat[1,], testMat[2,])
+  elem2 <- idfCosine(testMat[1,], testMat[3,])
+  elem3 <- idfCosine(testMat[2,], testMat[3,])
+  
+  expect_equal(rcppIdf, c(elem1, elem2, elem3))
+})
+
diff --git a/tests/testthat/test-lexRank.R b/tests/testthat/test-lexRank.R
new file mode 100644
index 0000000..8d1905b
--- /dev/null
+++ b/tests/testthat/test-lexRank.R
@@ -0,0 +1,40 @@
+context("lexRank")
+
+# test object out str and class ---------------------------------------
+test_that("object out str and class", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  testResult <- lexRank(testDocs, Verbose = FALSE)
+  
+  expect_equal(class(testResult), "data.frame")
+  expect_equal(names(testResult), c("docId","sentenceId", "sentence","value"))
+  expect_true(is.character(testResult$sentenceId))
+  expect_true(is.character(testResult$sentence))
+  expect_true(is.numeric(testResult$value))
+})
+
+# test bad inputs ---------------------------------------
+test_that("bad inputs", {
+  expect_error(lexRank(FALSE, Verbose = FALSE))
+  expect_error(lexRank(NULL, Verbose = FALSE))
+})
+
+# test object out value
+test_that("object out value", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  
+  testResult <- lexRank(testDocs, Verbose = FALSE) %>% 
+    dplyr::mutate(value = round(value, 5))
+  
+  expectedResult <- data.frame(docId = c(2L, 1L, 3L),
+                               sentenceId = c("2_1", "1_1", "3_1"),
+                               sentence = c("Is everything working as expected in my test?", 
+                                            "Testing 1, 2, 3.", "Is it working?"),
+                               value = c(0.48649, 0.25676, 0.25676),
+                               stringsAsFactors = FALSE)
+  
+  expect_identical(testResult, expectedResult)
+})
diff --git a/tests/testthat/test-lexRankFromSimil.R b/tests/testthat/test-lexRankFromSimil.R
new file mode 100644
index 0000000..f2762de
--- /dev/null
+++ b/tests/testthat/test-lexRankFromSimil.R
@@ -0,0 +1,78 @@
+context("lexRankFromSimil")
+
+# test object out str and class ---------------------------------------
+test_that("object out str and class", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  similDf <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                           token = tokenDf$token,
+                           docId = tokenDf$docId)
+  
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal)
+  
+  expect_equal(class(testResult), "data.frame")
+  expect_equal(names(testResult), c("sentenceId", "value"))
+  expect_true(is.character(testResult$sentenceId))
+  expect_true(is.numeric(testResult$value))
+})
+
+# test bad inputs ---------------------------------------
+test_that("bad inputs", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  similDf <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                           token = tokenDf$token,
+                           docId = tokenDf$docId)
+  
+  expect_error(lexRankFromSimil(NULL, similDf$sent2, similDf$similVal))
+  expect_error(lexRankFromSimil(c(1,2), similDf$sent2, similDf$similVal))
+  expect_error(lexRankFromSimil(similDf$sent1, similDf$sent2, c("a","b","c")))
+  expect_error(lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, threshold = NULL))
+  expect_error(lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, damping = NULL))
+  
+})
+
+# test object out value
+test_that("object out value", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  similDf <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                           token = tokenDf$token,
+                           docId = tokenDf$docId)
+  
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) %>% 
+    dplyr::mutate(value = round(value, 5))
+  
+  expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
+                               value = c(0.25676, 0.48649, 0.25676),
+                               stringsAsFactors = FALSE)
+  
+  expect_identical(testResult, expectedResult)
+  
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) %>% 
+    dplyr::mutate(value = round(value, 5))
+  
+  expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
+                               value = c(0.25676, 0.48649, 0.25676),
+                               stringsAsFactors = FALSE)
+  
+  expect_identical(testResult, expectedResult)
+  
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) %>% 
+    dplyr::mutate(value = round(value, 5))
+  
+  expectedResult <- data.frame(sentenceId = c("2_1", "1_1", "3_1"),
+                               value = c(2, 1, 1),
+                               stringsAsFactors = FALSE)
+  
+  expect_identical(testResult, expectedResult)
+})
diff --git a/tests/testthat/test-sentenceParse.R b/tests/testthat/test-sentenceParse.R
new file mode 100644
index 0000000..3eae29a
--- /dev/null
+++ b/tests/testthat/test-sentenceParse.R
@@ -0,0 +1,47 @@
+context("sentenceParse")
+
+# test sentence object structure-----------------------------------------------
+
+test_that("sentenceParse output class and structure check", {
+  testDoc <- "Testing one, two, three. Is everything working as expected Mr. Wickham?"
+  testResult <- sentenceParse(testDoc)
+  
+  expect_equal(class(testResult), "data.frame")
+  
+  expect_equal(names(testResult), c("docId", "sentenceId", "sentence"))
+  
+  expect_true(is.numeric(testResult$docId))
+  expect_true(is.character(testResult$sentenceId))
+  expect_true(is.character(testResult$sentence))
+  
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  expect_error(sentenceParse(NULL))
+  expect_error(sentenceParse(data.frame(badInput="test")))
+  
+  expect_error(sentenceParse("test", docId = c("fake","fake2")))
+  expect_error(sentenceParse(c("test","test2"), docId = "fake"))
+  expect_error(sentenceParse(c("test","test2"), docId = NULL))
+})
+
+# test sentence output value -----------------------------------------------
+
+test_that("Example doc parses sentences as expected", {
+  testDoc <- "Testing one, two, three. Is everything working as expected Mr. Wickham?"
+  testResult <- sentenceParse(testDoc)
+  
+  expectedResult <- data.frame(docId = c(1L, 1L), 
+                               sentenceId = c("1_1", "1_2"),
+                               sentence = c("Testing one, two, three.", 
+                                            "Is everything working as expected Mr. Wickham?"),
+                               stringsAsFactors = FALSE)
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "data.frame")
+  
+  expect_equal
+  
+})
\ No newline at end of file
diff --git a/tests/testthat/test-sentenceSimil.R b/tests/testthat/test-sentenceSimil.R
new file mode 100644
index 0000000..f42ab5b
--- /dev/null
+++ b/tests/testthat/test-sentenceSimil.R
@@ -0,0 +1,70 @@
+context("sentenceSimil")
+
+# test object out str and class ---------------------------------------
+test_that("testing result str and class", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId)
+  
+  expect_equal(class(testResult), "data.frame")
+  expect_equal(names(testResult), c("sent1","sent2","similVal"))
+  
+  expect_true(is.character(testResult$sent1))
+  expect_true(is.character(testResult$sent2))
+  expect_true(is.numeric(testResult$similVal))
+})
+  
+
+test_that("bad input", {
+  expect_error(sentenceSimil(sentenceId = c("1_1"),
+                             token = c("word","word2"),
+                             docId = c(1,2)))
+  
+  expect_error(sentenceSimil(sentenceId = c("1_1", "2_1"),
+                             token = c(1,2),
+                             docId = c(1,2)))
+  
+  testDocs <- c("test","test")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  expect_error(sentenceSimil(sentenceId = tokenDf$sentenceId,
+                             token = tokenDf$token,
+                             docId = tokenDf$docId))
+  
+  testDocs <- c("1","2")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  expect_error(sentenceSimil(sentenceId = tokenDf$sentenceId,
+                             token = tokenDf$token,
+                             docId = tokenDf$docId))
+})
+
+# test output value ---------------------------------------
+
+test_that("output value check", {
+  testDocs <- c("Testing 1, 2, 3.", 
+                "Is everything working as expected in my test?",
+                "Is it working?")
+  tokenDf <- sentenceTokenParse(testDocs)$tokens
+  
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId) %>% 
+    dplyr::mutate(similVal = round(similVal, 5))
+  
+  expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
+                               sent2 = c("2_1", "3_1", "3_1"),
+                               similVal = c(0.32718, 0, 0.32718),
+                               stringsAsFactors = FALSE)
+  
+  expect_equal(testResult, expectedResult)
+  
+})
+
+
+
diff --git a/tests/testthat/test-sentenceTokenParse.R b/tests/testthat/test-sentenceTokenParse.R
new file mode 100644
index 0000000..4e5b0a8
--- /dev/null
+++ b/tests/testthat/test-sentenceTokenParse.R
@@ -0,0 +1,40 @@
+context("sentenceTokenParse")
+
+# test output classes ----------------------------------------
+test_that("object class and structure check", {
+  testDocs <- c("12345", "Testing 1, 2, 3.", "Is everything working as expected Mr. Wickham?")
+  testResult <- sentenceTokenParse(testDocs)
+  
+  expect_equal(class(testResult), "list")
+  
+  expect_equal(unique(vapply(testResult, class, character(1))), "data.frame")
+  expect_equal(names(testResult$tokens), c("docId","sentenceId","token"))
+  
+  expect_true(is.numeric(testResult$tokens$docId))
+  expect_true(is.character(testResult$tokens$sentenceId))
+  expect_true(is.character(testResult$tokens$token))
+})
+
+# test output value -------------------------------------------
+
+test_that("All clean options TRUE", {
+  testDocs <- c("Testing 1, 2, 3.", "Is everything working as expected Mr. Wickham?")
+  testResult <- sentenceTokenParse(testDocs,
+                                   docId = "create",
+                                   removePunc=TRUE,
+                                   removeNum=TRUE,
+                                   toLower=TRUE,
+                                   stemWords=TRUE,
+                                   rmStopWords=TRUE)
+  
+  expectedResultSentences <- sentenceParse(testDocs)
+  expectedResultTokens <- lexRankr::tokenize(testDocs) %>% 
+    unlist() %>% 
+    .[which(!is.na(.))]
+
+  expect_equal(testResult$sentences, expectedResultSentences)
+  expect_equal(testResult$tokens$token, expectedResultTokens)
+  
+  expect_equal(class(testResult), "list")
+})
+
diff --git a/tests/testthat/test-tokenize.R b/tests/testthat/test-tokenize.R
new file mode 100644
index 0000000..47c292e
--- /dev/null
+++ b/tests/testthat/test-tokenize.R
@@ -0,0 +1,144 @@
+context("tokenize")
+
+# test tokenize output classes ----------------------------------------
+test_that("All clean options TRUE", {
+  testDocs <- c("12345", "Testing 1, 2, 3.", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs)
+  
+  expect_equal(class(testResult), "list")
+  
+  expect_equal(unique(vapply(testResult, class, character(1))), "character")
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  expect_error(tokenize(NULL))
+  expect_error(tokenize(data.frame(badInput="test")))
+  
+  expect_error(tokenize("test", removePunc=NULL))
+  expect_error(tokenize("test", removeNum=NULL))
+  expect_error(tokenize("test", toLower=NULL))
+  expect_error(tokenize("test", stemWords=NULL))
+  expect_error(tokenize("test", rmStopWords=NULL))
+})
+
+# test tokenize and arg option variations ------------------------------
+
+test_that("All clean options TRUE", {
+  testDocs <- c("Testing 1, 2, 3.", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=TRUE,
+                         removeNum=TRUE,
+                         toLower=TRUE,
+                         stemWords=TRUE,
+                         rmStopWords=TRUE)
+  
+  expectedResult <- list("test", c("work","expect","mr","wickham"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("All clean options FALSE", {
+  testDocs <- c("Testing 1, 2, 3", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=FALSE,
+                         removeNum=FALSE,
+                         toLower=FALSE,
+                         stemWords=FALSE,
+                         rmStopWords=FALSE)
+  
+  expectedResult <- list(c("Testing", "1", ",", "2", ",", "3"), 
+                         c("Is", "everything", "working", "as", "expected", "Mr", ".", "Wickham", "?"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("Single option tests: removePunc = FALSE", {
+  testDocs <- c("Testing 1, 2, 3.", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=FALSE,
+                         removeNum=TRUE,
+                         toLower=TRUE,
+                         stemWords=TRUE,
+                         rmStopWords=TRUE)
+  
+  expectedResult <- list(c("test",",",",","." ), 
+                         c("work","expect","mr",".","wickham","?" ))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("Single option tests: removeNum = FALSE", {
+  testDocs <- c("Testing 1, 2, 3", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=TRUE,
+                         removeNum=FALSE,
+                         toLower=TRUE,
+                         stemWords=TRUE,
+                         rmStopWords=TRUE)
+  
+  expectedResult <- list(c("test","1","2","3"), 
+                         c("work","expect","mr","wickham"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("Single option tests: toLower = FALSE", {
+  testDocs <- c("Testing 1, 2, 3", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=TRUE,
+                         removeNum=TRUE,
+                         toLower=FALSE,
+                         stemWords=TRUE,
+                         rmStopWords=TRUE)
+  
+  expectedResult <- list(c("Test"), 
+                         c("work","expect","Mr","Wickham"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("Single option tests: stemWords = FALSE", {
+  testDocs <- c("Testing 1, 2, 3", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=TRUE,
+                         removeNum=TRUE,
+                         toLower=TRUE,
+                         stemWords=FALSE,
+                         rmStopWords=TRUE)
+  
+  expectedResult <- list(c("testing"), 
+                         c("working","expected","mr","wickham"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
+test_that("Single option tests: rmStopWords = FALSE", {
+  testDocs <- c("Testing 1, 2, 3", "Is everything working as expected Mr. Wickham?")
+  testResult <- tokenize(testDocs,
+                         removePunc=TRUE,
+                         removeNum=TRUE,
+                         toLower=TRUE,
+                         stemWords=TRUE,
+                         rmStopWords=FALSE)
+  
+  expectedResult <- list(c("test"), 
+                         c("i","everyth","work","a", "expect", "mr", "wickham"))
+  
+  expect_equal(testResult, expectedResult)
+  
+  expect_equal(class(testResult), "list")
+})
+
diff --git a/tests/testthat/test-unnest_sentences.R b/tests/testthat/test-unnest_sentences.R
new file mode 100644
index 0000000..e9e5f5d
--- /dev/null
+++ b/tests/testthat/test-unnest_sentences.R
@@ -0,0 +1,55 @@
+context("unnest_sentences")
+
+# test output str --------------------------------------------------------
+test_that("correct ouput class and str", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- unnest_sentences(df, out, text)
+  
+  expect_equal(dim(test_result), c(4,3))
+  expect_true(is.data.frame(test_result))
+  expect_equal(names(test_result), c("doc_id","sent_id","out"))
+  
+  test_result <- unnest_sentences(df, out, text, drop=FALSE)
+  
+  expect_equal(dim(test_result), c(4,4))
+  expect_equal(names(test_result), c("doc_id","text","sent_id","out"))
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  expect_error(unnest_sentences(df, out, fake))
+  expect_error(unnest_sentences(NULL, out, text))
+  expect_error(unnest_sentences(df, out, text, drop = NULL))
+})
+
+# test output val ------------------------------------------------------
+test_that("output value", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- unnest_sentences(df, out, text)
+  expected_result <- data.frame(doc_id = c(1L, 1L, 2L, 3L), 
+                                sent_id = c(1L, 2L, 1L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+})
+
diff --git a/tests/testthat/test-unnest_sentences_.R b/tests/testthat/test-unnest_sentences_.R
new file mode 100644
index 0000000..45fa9d6
--- /dev/null
+++ b/tests/testthat/test-unnest_sentences_.R
@@ -0,0 +1,56 @@
+context("unnest_sentences_")
+
+# test output str --------------------------------------------------------
+test_that("correct ouput class and str", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result <- unnest_sentences_(df, "out", "text")
+  
+  expect_equal(dim(test_result), c(4,3))
+  expect_true(is.data.frame(test_result))
+  expect_equal(names(test_result), c("doc_id","sent_id","out"))
+  
+  test_result <- unnest_sentences_(df, "out", "text", drop=FALSE)
+  
+  expect_equal(dim(test_result), c(4,4))
+  expect_equal(names(test_result), c("doc_id","text","sent_id","out"))
+})
+
+# test bad input -------------------------------------------------------
+test_that("test input checking", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  expect_error(unnest_sentences_(df, "out", "fake"))
+  expect_error(unnest_sentences_(NULL, "out", "text"))
+  expect_error(unnest_sentences_(df, "out", "text", drop = NULL))
+  expect_warning(unnest_sentences_(df, "out", "text", c("test","test2")))
+})
+
+# test output val ------------------------------------------------------
+test_that("output value", {
+  df <- data.frame(doc_id = 1:3, 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- unnest_sentences_(df, "out", "text")
+  expected_result <- data.frame(doc_id = c(1L, 1L, 2L, 3L), 
+                                sent_id = c(1L, 2L, 1L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
+})
+
diff --git a/vignettes/Analyzing_Twitter_with_LexRankr.html.asis b/vignettes/Analyzing_Twitter_with_LexRankr.html.asis
new file mode 100644
index 0000000..698eea1
--- /dev/null
+++ b/vignettes/Analyzing_Twitter_with_LexRankr.html.asis
@@ -0,0 +1,4 @@
+%\VignetteIndexEntry{Analyzing Twitter with LexRankr}
+%\VignetteEngine{R.rsp::asis}
+%\VignetteKeyword{twitter}
+%\VignetteKeyword{lexrankr}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-lexrankr.git



More information about the debian-med-commit mailing list