RDocumentation: tokens

    Note that this notebook was automatically generated from an RDocumentation page. Whether the code runs without errors depends on the package and the example code; you may need to edit it to make things work.

    # install and attach quanteda if it is not already available
    if (!require("quanteda")) {
        install.packages("quanteda")
        library("quanteda")
    }
    txt <- c(doc1 = "A sentence, showing how tokens() works.",
             doc2 = "@quantedainit and #textanalysis https://example.com?p=123.",
             doc3 = "Self-documenting code??",
             doc4 = "£1,000,000 for 50¢ is gr8 4ever \U0001f600")
    # default word tokenizer
    tokens(txt)
    # "word1" selects quanteda's legacy (pre-v2) word tokenizer
    tokens(txt, what = "word1")
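
    # Not in the original example: a brief sketch assuming tokens() also accepts
    # what = "character" for character-level tokenization.
    tokens(txt[1], what = "character")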
    
    # removing punctuation marks but keeping tags and URLs
    tokens(txt[1:2], remove_punct = TRUE)
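
    # Not in the original example: a sketch assuming tokens() also has a remove_url
    # argument, which would drop the URL that remove_punct alone keeps.
    tokens(txt[2], remove_punct = TRUE, remove_url = TRUE)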
    
    # splitting hyphenated words
    tokens(txt[3])
    tokens(txt[3], split_hyphens = TRUE)
    
    # symbols and numbers
    tokens(txt[4])
    tokens(txt[4], remove_numbers = TRUE)
    tokens(txt[4], remove_numbers = TRUE, remove_symbols = TRUE)
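
    # Not in the original example: a sketch assuming the padding argument, which
    # would leave an empty string at each removed token's position.
    tokens(txt[4], remove_numbers = TRUE, remove_symbols = TRUE, padding = TRUE)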
    
    # using other tokenizers (requires the "tokenizers" package)
    if (requireNamespace("tokenizers", quietly = TRUE)) {
        tokens(tokenizers::tokenize_words(txt[4]), remove_symbols = TRUE)
        tokenizers::tokenize_words(txt, lowercase = FALSE, strip_punct = FALSE) %>%
            tokens(remove_symbols = TRUE)
        tokenizers::tokenize_characters(txt[3], strip_non_alphanum = FALSE) %>%
            tokens(remove_punct = TRUE)
        tokenizers::tokenize_sentences(
            "The quick brown fox.  It jumped over the lazy dog.") %>%
            tokens()
    }
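
    # Not in the original example: a sketch of downstream processing, assuming
    # quanteda's tokens_tolower(), tokens_remove(), and stopwords() functions.
    toks <- tokens(txt, remove_punct = TRUE)
    tokens_remove(tokens_tolower(toks), stopwords("en"))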