摘要: 仅用于记录R语言学习过程:
内容提要:
字符串的处理、正则表达式、stringi包和stringr包
正文:
字符串的处理
n 导读:
u nchar(x)函数:返回每个字符串中字符的个数:
> x <- c('fudan','jiaoda')
> nchar(x)
[1] 5 6 #返回每个字符串所含字符的个数
u length()函数:返回元素的个数
> length(x)
[1] 2
u toupper()函数:小写转大写
> toupper('abc')
[1] "ABC"
u tolower()函数:大写转小写
> tolower('ABKC')
[1] "abkc"
u paste()函数:(sep参数和collapse参数)粘贴功能
> stringa <- LETTERS[1:5]
> STRINGB <- 1:5
> paste(stringa,STRINGB)
[1] "A 1" "B 2" "C 3" "D 4" "E 5"
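> paste(stringa,STRINGB,seq = '-') #注意:分隔符参数名是sep;这里写成seq,'-'被当作第三个待粘贴的向量
[1] "A 1 -" "B 2 -" "C 3 -" "D 4 -" "E 5 -"
> paste(stringa,STRINGB,sep = '-') #sep分隔符
[1] "A-1" "B-2" "C-3" "D-4" "E-5"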
> paste(stringa,STRINGB,collapse = '-') # collapse分隔符
[1] "A 1-B 2-C 3-D 4-E 5"
u paste0()函数:相当于paste(...,sep = ''),去掉了A和1之间的空格;paste0()没有sep参数(下面的seq = '-'只是被当作又一个待粘贴的内容),collapse的用法与paste()相同
> paste0(stringa,STRINGB)
[1] "A1" "B2" "C3" "D4" "E5"
> paste0(stringa,STRINGB,seq = '-')
[1] "A1-" "B2-" "C3-" "D4-" "E5-"
> paste0(stringa,STRINGB,collapse = '-')
[1] "A1-B2-C3-D4-E5"
u strsplit()函数:字符串拆分功能
> stringC <- paste(stringa, STRINGB, seq = '/')
> strsplit(stringC,split = '/') #根据/ 进行拆分
[[1]]
[1] "A 1 "
[[2]]
[1] "B 2 "
[[3]]
[1] "C 3 "
[[4]]
[1] "D 4 "
[[5]]
[1] "E 5 "
u substr()函数:字符串截取函数;同时具有赋值功能
> stringd <- c('python','java','ruby','php','linux')
> sub_str <- substr(stringd,start = 2,stop = 4) #截取2-4位的字符,如果不够,就有几个返回几个
> sub_str
[1] "yth" "ava" "uby" "hp" "inu"
#实现赋值的功能
> substr(stringd,start = 2,stop = 4) <- 'aaa'
> stringd
[1] "paaaon" "jaaa" "raaa" "paa" "laaax"
u grep()函数:用于提取字符串中指定的字符,可返回位置,也可返回具体的值。
> seq_names <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',
+ 'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',
+ 'NA_USA03_C2_S2007','NA USA04 A3 2004',
+ 'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')
> fra_seq <- grep(pattern = 'FRA|fra',x =seq_names)
> fra_seq
[1] 1 5 11
> seq_names[fra_seq]
[1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007"
[3] "eu_fra_a2_s98"
> fra_seq <- grep(pattern = 'FRA|fra',x =seq_names,value = TRUE)
> fra_seq
[1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007"
[3] "eu_fra_a2_s98"
u grepl()函数:返回的是逻辑值。没有value参数。ignore.case参数表示是否忽略大小写,TRUE为忽略。
> grepl(pattern = 'FRA|fra',x =seq_names)
[1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[10] FALSE TRUE FALSE
> fra_seq <- grepl(pattern = 'fra',x =seq_names,ignore.case = TRUE) #或:用ignore.case = TRUE忽略大小写(grepl()没有value参数,写value = TRUE会报错)
u 正则表达式:提取元素
> spe_seq <- seq_names[!grepl(pattern = '[s|S][0-9]{2,4}\\b',seq_names)] #匹配右边界
> spe_seq
[1] "AF_COM12_80_20014" "AS_CHN11_C3_2004"
[3] "NAUSA02E02005" "AS_CHN12_N0_05"
[5] "NA USA04 A3 2004" "EU_UK01_A0_2009"
[7] "SA/BRA08/00/1996"
找到以ab开头的
my_string <- c('above','about','abrotion','cab')
grep(pattern = '\\bab',x = my_string,value = T) #匹配左边界
u gsub()函数:会把找到的所有匹配字符都替换掉;例如去掉$之后,再用as.numeric()把字符串变成数值
money <- c('$1888','$2888','$3888')
money <- gsub('\\$',replacement = '',money) #gsub()不会修改原变量,需要把结果重新赋值
as.numeric(money)
u sub()函数:只会替换掉找到的第一个字符
> money <- c('$1888 $2888 $3888')
> sub('\\$',replacement = '',money)
[1] "1888 $2888 $3888"
> gsub('\\$',replacement = '',money)
[1] "1888 2888 3888"
u regexpr()函数:
> test_string <- c('happy','apple','application','apolitic')
> regexpr('pp',test_string)
[1] 3 2 2 -1 #返回pp出现的位置,-1表示没有
attr(,"match.length")
[1] 2 2 2 -1
attr(,"useBytes")
[1] TRUE
> test_string[regexpr('pp',test_string)>0] #提取含pp的字符串
[1] "happy" "apple" "application"
u gregexpr()函数:用法同regexpr()函数,但返回每个字符串中所有匹配的位置(结果为列表)
u regexec()函数:用法同regexpr()函数,返回列表,除整体匹配外还给出各括号子表达式的匹配位置
u agrep()函数:近似(模糊)匹配,允许少量拼写差异,因此可以匹配英美单词的不同写法
> string1 <- c('I need a favour','my favorite sport','you made an error')
> agrep('favor',string1)
[1] 1 2
正则表达式
n 原义表达式:只代表自己
> mystring1 <- c('apple','orange')
> grep('p',mystring1)
[1] 1
n 转义表达式:代表其他含义
> # .所有字符
> mystring2 <- c('shudo','.dfs','-dsfd')
> grep('.',mystring2)
[1] 1 2 3
>
> mystring3 <- c('9anv','fss7','1000','ss7')
> grep('[7-9]',mystring3)
[1] 1 2 4
>
> # ^a,匹配a开头的
> mystring4 <- c('apple','application','abb')
> grep('^ap',mystring4)
[1] 1 2
> # [^0-1]表示匹配0和1以外的任意字符
> mystring5 <- c('9anv','fss7','1000','ss7')
> grep('[^0-1]',mystring5)
[1] 1 2 4
> #{}代表重复的次数,{1,}表示至少重复1次,{2,3}表示重复2到3次
> mystring6 <- c('1220','2289','2228','10002')
> grep('2{2,3}',mystring6)
[1] 1 2 3
> # + 表示其前面紧邻的字符重复1次或多次,()表示把括号内的内容看成一个整体
> mystring7 <- c('food','foot','foul','fans')
> grep ('fo+',mystring7)
[1] 1 2 3
> grep('fo{1,}',mystring7)
[1] 1 2 3
> grep('(fo){1,}',mystring7)
[1] 1 2 3
>
> #* 匹配0次或以上
> #| 管道符 或,满足其中之一就可被返回
>
> mystring8 <- c('kobe','messi','neymar')
> grep('^k|^m',mystring8)
[1] 1 2
> # $表示匹配字符串末尾
> mystring9 <- c('active','positive','negative','iention')
> grep('ive$',mystring9) #匹配字符串末尾
[1] 1 2 3
> grep('ive\\b',mystring9)
[1] 1 2 3
n 保义字符:在特殊字符前加\使其恢复原义(在R字符串中要写成\\):
# \
mystring10 <- c('ac^bb','^df')
grep('\\^',mystring10)
[1] 1 2
\\d = [0-9] 匹配数字0-9
\\D = [^0-9] 匹配非数字
\\s 匹配空白字符,空格,制表符,换行符
\\S 匹配非空白字符
\\w 匹配字母、数字和下划线 =[a-zA-Z0-9_]
\\W 匹配非字母、数字和下划线 =[^a-zA-Z0-9_]
\\b 匹配单词的边界
\\B 匹配非单词边界
\\< 匹配单词的开头(单词起始处的边界),如'\\<the'匹配以the开头的单词
\\> 匹配单词的结尾(单词结束处的边界),如'the\\>'匹配以the结尾的单词
示例:
> mystring11 <- c('2013','abcd','13sg')
> grep('\\d',mystring11)
[1] 1 3
> grep('\\D',mystring11)
[1] 2 3
> mystring12 <- c('foo t',' able',' moth er','happy')
> grep('\\s',mystring12)
[1] 1 2 3
> grep('\\S',mystring12)
[1] 1 2 3 4
> mystring13 <- c('theory','the republic','they')
> grep('\\<the\\>',mystring13) #以the作为边界的字符串,the为一个单独的单词
[1] 2
stringr与stringi包
n stringi包更加依赖正则表达式
n stringr中的常用函数
u str_c()函数:类似paste0()函数(默认sep = '',即不加分隔符)
> str_c('a','b')
[1] "ab"
> str_c('a','b',sep = '-')
[1] "a-b"
u str_length()函数:用于字符串计数
> str_length('abdc')
[1] 4
u str_sub()函数:用于字符串提取,类似substr()函数,有三个参数:数据名,开始位置,结束位置(可以接受向量),可以接受赋值
> yxf <- 'yi xue fang'
> str_sub(yxf,c(1,4,8),c(2,6,11))
[1] "yi" "xue" "fang"
>
> str_sub(yxf,1,1) <- 'Y' #可以接受赋值
> yxf
[1] "Yi xue fang"
u str_dup()函数:用于复制
> fruit <- c('apple','pear','banana')
> str_dup(fruit,2)
[1] "appleapple" "pearpear" "bananabanana"
> fruit <- c('apple','pear','banana')
> str_dup(fruit,2:4)
[1] "appleapple" "pearpearpear"
[3] "bananabananabananabanana"
u str_trim()函数:去掉字符串首尾的空格,也可以设置成right和left,分别去掉右边和左边的空格
> string <- ' Eternal love for YanQ '
> str_trim(string,side = 'both')
[1] "Eternal love for YanQ"
u str_extract()函数:用于提取
phones <- c('219 733 8965','329-293-8753','banana','595 794 7569',
'387 287 6718','apple','233.398.9187','482 952 3315',
'239 923 8115 and 842 566 4692','Work: 579-499-7527','$1000',
'Home:543.355.3679')
str_extract(phones,'([0-9]{3})[- .]([0-9]{3})[- .]([0-9]{4})\\b')
[1] "219 733 8965" "329-293-8753" NA "595 794 7569" "387 287 6718"
[6] NA "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"
[11] NA "543.355.3679"
或写成:str_extract(phones,'([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})')
u str_replace()函数:用于字符串替换,只替换找到的第一个
> fruits <- c('one apple','two pears','three bananas')
> str_replace(fruits,'[aeiou]','-') #[被替换的对象] ,‘拟替换成的对象’
[1] "-ne apple" "tw- pears" "thr-e bananas"
str_replace_all()函数:替换所有
> fruits <- c('one apple','two pears','three bananas')
> str_replace_all(fruits,'[aeiou]','-')
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
n stringi中的常用函数
u stri_join()函数:用于拼接字符串,类似paste0()函数,支持sep和collapse参数
> stri_join(1:7,letters[1:7],sep = '-')
[1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"
> stri_join(1:7,letters[1:7],collapse = '-')
[1] "1a-2b-3c-4d-5e-6f-7g"
u stri_cmp_eq() & stri_cmp_neq()函数:判断两个字符串是否相等(eq)或不相等(neq)
> stri_cmp_eq('ab','ab')
[1] TRUE
> stri_cmp_neq('ab','ab')
[1] FALSE
u stri_cmp_lt() & stri_cmp_gt()函数:用于字符串比大小,lt 前者小于后者,gt前者大于后者
> stri_cmp_lt('121','221')
[1] TRUE
> stri_cmp_lt('a121','b221')
[1] TRUE
> stri_cmp_gt('121','221')
[1] FALSE
u stri_count()函数:用于计数
> language <- c('python','R','PHP','Ruby','Java',
+ 'JavaScript','C','Oracle','C++','C#','Spark',
+ 'Go','Room','Good','Pathon','ScriptJava','R2R','C+','C*')
> stri_count(language,fixed = 'R')
[1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0
> stri_count(language,regex = '^J')
[1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
u stri_count_boundaries()函数:按指定的边界类型(type = 'word'、'sentence'、'character'等)统计文本被切分成的片段个数
> test <- 'The\u00a0above-mentioned features are very useful.
+ Warm thanks to their developers. Tomorrow is a ,new$% day###'
> stri_count_boundaries(test,type = 'word')
[1] 45
> stri_count_boundaries(test,type = 'sentence')
[1] 3
> stri_count_boundaries(test,type = 'character')
[1] 110
u stri_duplicated()函数:识别重复的字符串
> stri_duplicated(c('a','b','a',NA,'a',NA))
[1] FALSE FALSE TRUE FALSE TRUE TRUE
> stri_duplicated(c('a','b','a',NA,'a',NA),fromLast = T) #从最后开始看
[1] TRUE FALSE TRUE TRUE FALSE FALSE
> stri_duplicated_any(c('a','b','a',NA,'a',NA))
[1] 3
u stri_dup()函数:重复
> stri_dup(c('abc','parst'),c(4,2))
[1] "abcabcabcabc" "parstparst"
u stri_detect_fixed()函数:发现匹配函数
> stri_detect_fixed(c('stringi R','REXAMINE','123'),c('i','R','0'))
[1] TRUE TRUE FALSE
u stri_detect_regex()函数:
> stri_detect_regex(c('above','abort','about','abnormal','abandon'),'^ab')
[1] TRUE TRUE TRUE TRUE TRUE
> stri_detect_regex(c('above','abort','about','abnormal','abandon'),'t\\b')
[1] FALSE TRUE TRUE FALSE FALSE
> stri_detect_regex(c('ABOUT','abort','AboVE'),'^ab',case_insensitive = TRUE) #忽略大小写
[1] TRUE TRUE TRUE
u stri_startswith_fixed()函数:
> stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a')
[1] TRUE TRUE FALSE TRUE FALSE
>
> stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a1')
[1] TRUE FALSE FALSE FALSE FALSE
>
> stri_startswith_fixed(c('abaDc','aabadc','ababa'),'ba',from = 2) #从哪个字符开始匹配,从第二个字符开始匹配
[1] TRUE FALSE TRUE
u stri_endswith_fixed()函数:
> stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba')
[1] FALSE FALSE TRUE
> stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba', to = 3) #匹配到第几位,匹配到第三位
[1] TRUE FALSE TRUE
u stri_extract_all()函数:提取
> tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',
+ 'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',
+ 'NA_USA03_C2_S2007','NA USA04 A3 2004',
+ 'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')
>
> # Generate a character vector composed of several sequence names.
>
> stri_extract_all(tEmp_text,regex = '[0-9]{2,4}\\b')
[[1]]
[1] "2008"
[[2]]
[1] "0014"
[[3]]
[1] "2008"
[[4]]
[1] "2004"
[[5]]
[1] "2007"
[[6]]
[1] "2005"
[[7]]
[1] "05"
[[8]]
[1] "2007"
[[9]]
[1] "04" "2004"
[[10]]
[1] "2009"
[[11]]
[1] "98"
[[12]]
[1] "08" "00" "1996"
u stri_extract_all_fixed()函数:
> stri_extract_all_fixed('abaBAba','Aba',case_insensitive = T, overlap =T)
[[1]] #可交叉
[1] "aba" "aBA" "Aba"
u stri_extract_all_boundaries()函数:按文本边界(默认为可换行位置line_break)切分字符串并提取各片段
> stri_extract_all_boundaries('stringi: THE string processing package 123.48...')
[[1]]
[1] "stringi: " "THE " "string " "processing " "package "
[6] "123.48..." #但是带出来单词后面的空格
u stri_extract_all_words()函数:提取字符串中的所有单词,不含空格和首尾标点
> stri_extract_all_words('stringi: THE string processing package 123.48...')
[[1]]
[1] "stringi" "THE" "string" "processing" "package" "123.48"
u stri_isempty()函数:字符串内是否为空
> stri_isempty(c(',','','abc','123','\u0105\u0104',' '))
[1] FALSE TRUE FALSE FALSE FALSE FALSE
u stri_locate_all()函数:定位函数
> stri_locate_all('I want to learn R to promote my statistical skills',fixed = 'to')
[[1]]
start end
[1,] 8 9
[2,] 19 20 #返回的是位置,起始和结束,可用于提取