stringr 套件

stringr 套件是 tidyverse 套件系統提供的一套用於處理文字和字串的套件。


使用 str_c() 函式合併文字或字串。使用引數 sep 設定合併的中間字元。

str_c("medical", "statistics")
## [1] "medicalstatistics"
str_c("medical", "statistics", sep = " ")
## [1] "medical statistics"

若是遇到缺失值 NA,則仍回傳 NA。若希望遇到缺失值 NA 時改以字串 "NA" 參與合併,可以加用函式 str_replace_na()。

x.char <- c("bio", NA, "statistics")
str_c("pre-", x.char, "-end")
## [1] "pre-bio-end" NA "pre-statistics-end"
str_c("pre-", str_replace_na(x.char), "-end")
## [1] "pre-bio-end" "pre-NA-end" "pre-statistics-end"

合併字串向量,使用引數 collapse

char.vec <- c("I", "love", "biostatistics")
str_c(char.vec, collapse = ", ")
## [1] "I, love, biostatistics"


使用 str_sub() 函式取字串的子串。

char.vec <- c("I", "love", "medical", "statistics")
str_sub(char.vec, start = 1, end = 3)
## [1] "I" "lov" "med" "sta"


套件 stringr 內的函式 str_trim() 與 str_pad() 可以對文字或字串向量首尾之空白(white space)進行移除或加入。


str_trim(string, side = c("both", "left", "right"))
str_pad(string, width, side = c("left", "right", "both"), pad = " ")
str_trunc(string, width, side = c("right", "left", "center"), ellipsis = "...")

str_trim() 函式用於移除字串兩端的空白,其中 side 變數的取值:both 代表兩端,left 代表僅左邊,right 代表僅右邊。

str_pad() 函式用於向字串兩端增加空白,引數中 width 表示增加後字串的長度,side 與上個函式相同,pad 表示以什麼字元填充空白。

str_trunc() 用於截斷過長的字串,並以省略號取代被截去的部分。width 引數表示截斷後(包括省略號)字串的長度,side 引數取 left、right、center,代表截去的位置。

veg.vec <- c("apple ", " eggplant ", " banana")
str_trim(veg.vec, side = c("both"))
## [1] "apple" "eggplant" "banana"
str_trim(veg.vec, side = c("left"))
## [1] "apple " "eggplant " "banana"
str_trim(veg.vec, side = c("right"))
## [1] "apple" " eggplant" " banana"
veg.vec <- c("apple ", " eggplant ", " banana")
str_pad("a", width = 15, side = c("both"), pad = " ")
## [1] " a "
str_pad("a", width = 15, side = c("both"), pad = c("_"))
## [1] "_______a_______"
str_pad(veg.vec, width = 15, side = c("both"))
## [1] " apple " " eggplant " " banana "
str_pad(veg.vec, width = 15, side = c("left"))
## [1] " apple " " eggplant " " banana"
str_pad(veg.vec, width = 15, side = c("right"))
## [1] "apple " " eggplant " " banana "
str_pad(veg.vec, width = 15, side = c("both"), pad = c("_"))
## [1] "____apple _____" "__ eggplant ___" "____ banana____"
char.vec <- c("I love biostatistics")
str_trunc(char.vec, width = 10, side = c("center"))
## [1] "I lo...ics"
str_trunc(char.vec, width = 10, side = c("left"))
## [1] "...tistics"
str_trunc(char.vec, width = 10, side = c("right"))
## [1] "I love ..."



str_detect() 函式用於偵測字串向量是否包含特定形式文字,回傳邏輯向量。說得通俗,就是判斷指定的字串中是否含有指定的子串。str_count() 函式用於偵測子串在指定的字串中出現了多少次。

str_detect(string, pattern, negate = FALSE)
str_count(string, pattern = "")


  • string:指定的字串。
  • pattern:要判斷的子串。
  • negate:若為 TRUE,則將結果反轉,回傳的邏輯向量中未配對成功者為 TRUE。


函式 str_locate() 用於查找配對成功的字串之第一次位置,回傳矩陣,包含開始和末端的位置。另外函式 str_locate_all() 尋找配對成功的字串之所有位置,回傳列表。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_locate(char.vec, pattern = "ti")
## start end
## [1,] 4 5
## [2,] 7 8
## [3,] NA NA
## [4,] 9 10
str_locate_all(char.vec, pattern = "ti")
## [[1]]
## start end
## [1,] 4 5
## [2,] 7 8
## [[2]]
## start end
## [1,] 7 8
## [2,] 10 11
## [[3]]
## start end
## [[4]]
## start end
## [1,] 9 10


函式 str_subset() 尋找字串向量內配對成功的元素,回傳其內容。函式 str_which() 則回傳字串向量內配對成功之元素的索引。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_subset(char.vec, pattern = "ti")
## [1] "statistics" "biostatistics" "distribution"
str_which(char.vec, pattern = "ti")
## [1] 1 2 4


函式 str_extract() 尋找配對成功的字串之第 1 次位置,回傳字串向量。另外函式 str_extract_all() 尋找配對成功的字串之所有位置,回傳所有字串向量形成列表。引數 simplify = TRUE 簡化成文字矩陣。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_extract(char.vec, pattern = "ti")
## [1] "ti" "ti" NA "ti"
str_extract_all(char.vec, pattern = "ti")
## [[1]]
## [1] "ti" "ti"
## [[2]]
## [1] "ti" "ti"
## [[3]]
## character(0)
## [[4]]
## [1] "ti"


函式 str_match() 使用在群組尋找特定形式文字或字串,若尋找到配對成功的字串之第 1 次位置,回傳文字矩陣,第一欄位為完全配對成功的文字,其餘欄位為群組內個別配對成功的文字。另外函式 str_match_all() 尋找配對成功的字串之所有位置。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_match(char.vec, pattern = "(a|ti)")
## [,1] [,2]
## [1,] "a" "a"
## [2,] "a" "a"
## [3,] "a" "a"
## [4,] "ti" "ti"
str_match_all(char.vec, pattern = "(a|ti)")
## [[1]]
## [,1] [,2]
## [1,] "a" "a"
## [2,] "ti" "ti"
## [3,] "ti" "ti"
## [[2]]
## [,1] [,2]
## [1,] "a" "a"
## [2,] "ti" "ti"
## [3,] "ti" "ti"
## [[3]]
## [,1] [,2]
## [1,] "a" "a"
## [[4]]
## [,1] [,2]
## [1,] "ti" "ti"


函式 str_replace() 尋找特定形式文字或字串,若尋找到配對成功的字串之第 1 次位置,則使用其他特定字串替代置換。引數 replacement 設定新的替代字串,用以置換原有配對成功的文字或字串。另外函式 str_replace_all() 尋找配對成功的字串之所有位置,同時使用其他特定字串替代置換。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_replace(char.vec, pattern = "ti", replacement = "--")
## [1] "sta--stics" "biosta--stics" "probability" "distribu--on"
str_replace_all(char.vec, pattern = "b", replacement = "+++")
## [1] "statistics" "+++iostatistics" "pro+++a+++ility" "distri+++ution"


函式 str_split() 尋找特定形式文字或字串,並在每個配對成功的位置分割字串向量,回傳分割結果為列表物件。其中引數 n 設定每個字串最多分割成的片段數目,simplify = TRUE 回傳物件簡化成文字矩陣。另外函式 str_split_fixed() 回傳物件簡化成文字矩陣且欄位(column)數目為 n。str_split_n() 回傳物件簡化成文字向量,長度為 n。

char.vec <- c("a b c", "d e", "bio-statistics required-courses")
str_split(char.vec, pattern = " ", n = Inf, simplify = FALSE)
## [[1]]
## [1] "a" "b" "c"
## [[2]]
## [1] "d" "e"
## [[3]]
## [1] "bio-statistics" "required-courses"
str_split(char.vec, pattern = " ", n = Inf, simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "b" "c"
## [2,] "d" "e" ""
## [3,] "bio-statistics" "required-courses" ""
str_split_fixed(char.vec, pattern = " ", n = 2)
## [,1] [,2]
## [1,] "a" "b c"
## [2,] "d" "e"
## [3,] "bio-statistics" "required-courses"
str_split_fixed(char.vec, pattern = "-", n = 2)
## [,1] [,2]
## [1,] "a b c" ""
## [2,] "d e" ""
## [3,] "bio" "statistics required-courses"


有些時候在尋找特定形式的文字與字串時,需要尋找不只一種特定的形式。例如,同時尋找 b 或 ti,可以輸入 b|ti。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_replace(char.vec, pattern = "b|ti", replacement = "--")
## [1] "sta--stics" "--iostatistics" "pro--ability" "distri--ution"
str_replace_all(char.vec, pattern = "b|ti", replacement = "+++")
## [1] "sta+++s+++cs" "+++iosta+++s+++cs" "pro+++a+++ility" "distri+++u+++on"

起始符號 ^ 可以尋找字串的起始具有特定形式,尾端符號 $ 可以尋找字串的尾端具有特定形式。例如 ^b 尋找字串的起始具有b,或 n$ 尋找字串的尾端具 n。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_replace(char.vec, pattern = "^b", replacement = "--")
## [1] "statistics" "--iostatistics" "probability" "distribution"
str_replace_all(char.vec, pattern = "n$", replacement = "+++")
## [1] "statistics" "biostatistics" "probability" "distributio+++"

有些時候需要尋找字串前後具有特定形式的文字與字串,例如,尋找在 ti 之前的字元,在 p 之後的字元等等。使用小括號 () 代表特定形式的前後順序。輸入 a(?=c) 表示在 a 之後有 c 字元,輸入 a(?!c) 表示在 a 之後無 c 字元,輸入 (?<=b)a 表示在 a 之前有 b 字元,輸入 (?<!b)a 表示在 a 之前無 b 字元。

char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_replace(char.vec, pattern = "t(?=i)", replacement = "--")
## [1] "sta--istics" "biosta--istics" "probability" "distribu--ion"
str_replace(char.vec, pattern = "t(?!i)", replacement = "--")
## [1] "s--atistics" "bios--atistics" "probabili--y" "dis--ribution"
str_replace(char.vec, pattern = "(?<=i)o", replacement = "--")
## [1] "statistics" "bi--statistics" "probability" "distributi--n"
str_replace_all(char.vec, pattern = "(?<=t)i", replacement = "--")
## [1] "stat--st--cs" "biostat--st--cs" "probability" "distribut--on"
str_replace(char.vec, pattern = "(?<!t)i", replacement = "--")
## [1] "statistics" "b--ostatistics" "probab--lity" "d--stribution"
str_replace_all(char.vec, pattern = "(?<!t)i", replacement = "--")
## [1] "statistics" "b--ostatistics" "probab--l--ty" "d--str--bution"


stringr 輸入    意義
a?              zero or one
a*              zero or more
a+              one or more
a{n}            exactly n
a{n,}           n or more
a{n,m}          between n and m
x.vec <- c(".a.aa.aaa.aaaa")
str_replace(x.vec, pattern = "a?", replacement = "-")
## [1] "-.a.aa.aaa.aaaa"
str_replace(x.vec, pattern = "a*", replacement = "-")
## [1] "-.a.aa.aaa.aaaa"
str_replace(x.vec, pattern = "a+", replacement = "-")
## [1] ".-.aa.aaa.aaaa"
str_replace(x.vec, pattern = "a{2}", replacement = "-")
## [1] ".a.-.aaa.aaaa"
str_replace(x.vec, pattern = "a{2,}", replacement = "-")
## [1] ".a.-.aaa.aaaa"
str_replace(x.vec, pattern = "a{2,3}", replacement = "-")
## [1] ".a.-.aaa.aaaa"
char.vec <- c("statistics", "biostatistics",
"probability", "distribution")
str_replace(char.vec, pattern = "i?", replacement = "-")
## [1] "-statistics" "-biostatistics" "-probability" "-distribution"
str_replace(char.vec, pattern = "i*", replacement = "-")
## [1] "-statistics" "-biostatistics" "-probability" "-distribution"
str_replace(char.vec, pattern = "i+", replacement = "-")
## [1] "stat-stics" "b-ostatistics" "probab-lity" "d-stribution"
str_replace(char.vec, pattern = "i{2}", replacement = "-")
## [1] "statistics" "biostatistics" "probability" "distribution"
str_replace(char.vec, pattern = "i{2,}", replacement = "-")
## [1] "statistics" "biostatistics" "probability" "distribution"
str_replace(char.vec, pattern = "i{2,3}", replacement = "-")
## [1] "statistics" "biostatistics" "probability" "distribution"


R Lang 尋找特定形式的文字與字串,可以使用程式語言通用的正規表示,在使用套件 stringr 輸入時有些差異。


  • 正規表示中以單一反斜線“\”表示的,在 stringr 中使用雙反斜線“\\”。例如正規表示 \! 表示驚嘆號 !,在 stringr 中表示作 \\!
  • 在上面的規則上,有幾個特例
    • 反斜線“\”,正規表示作 \\,stringr 表示作 \\\\
    • 製表字元(tab),正規表示作 \t,stringr 仍表示作 \t
  • 新增表示
[:lower:]    lowercase letters
[:upper:]    uppercase letters
[:alnum:]    letters and numbers
[:graph:]    letters, numbers and punctuation
[:space:]    space characters
[:blank:]    space and tab, without new line
.            every character except a new line
