RDocumentation: tokens

    Note that this notebook was automatically generated from an RDocumentation page. Whether the code runs without errors depends on the package and the example code; you may need to edit it to make things work.

    # install and attach quanteda if it is not already available
    if (!require("quanteda")) {
        install.packages("quanteda")
        library("quanteda")
    }
    txt <- c(doc1 = "A sentence, showing how tokens() works.",
             doc2 = "@quantedainit and #textanalysis https://example.com?p=123.",
             doc3 = "Self-documenting code??",
             doc4 = "£1,000,000 for 50¢ is gr8 4ever \U0001f600")
    # default word tokenizer
    tokens(txt)
    # "word1" selects quanteda's legacy (pre-v2) word tokenizer
    tokens(txt, what = "word1")
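
    # Not in the original example: a brief sketch assuming tokens() also accepts
    # what = "character" for character-level tokenization.
    tokens(txt[1], what = "character")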
    
    # removing punctuation marks but keeping tags and URLs
    tokens(txt[1:2], remove_punct = TRUE)
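
    # Not in the original example: a sketch assuming tokens() also has a remove_url
    # argument, which would drop the URL that remove_punct alone keeps.
    tokens(txt[2], remove_punct = TRUE, remove_url = TRUE)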
    
    # splitting hyphenated words
    tokens(txt[3])
    tokens(txt[3], split_hyphens = TRUE)
    
    # symbols and numbers
    tokens(txt[4])
    tokens(txt[4], remove_numbers = TRUE)
    tokens(txt[4], remove_numbers = TRUE, remove_symbols = TRUE)
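
    # Not in the original example: a sketch assuming the padding argument, which
    # would leave an empty string at each removed token's position.
    tokens(txt[4], remove_numbers = TRUE, remove_symbols = TRUE, padding = TRUE)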
    
    # using other tokenizers (requires the "tokenizers" package)
    if (requireNamespace("tokenizers", quietly = TRUE)) {
        tokens(tokenizers::tokenize_words(txt[4]), remove_symbols = TRUE)
        tokenizers::tokenize_words(txt, lowercase = FALSE, strip_punct = FALSE) %>%
            tokens(remove_symbols = TRUE)
        tokenizers::tokenize_characters(txt[3], strip_non_alphanum = FALSE) %>%
            tokens(remove_punct = TRUE)
        tokenizers::tokenize_sentences(
            "The quick brown fox.  It jumped over the lazy dog.") %>%
            tokens()
    }
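
    # Not in the original example: a sketch of downstream processing, assuming
    # quanteda's tokens_tolower(), tokens_remove(), and stopwords() functions.
    toks <- tokens(txt, remove_punct = TRUE)
    tokens_remove(tokens_tolower(toks), stopwords("en"))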