celestialgod

Conversion of fullwidth digits and Chinese numerals

Jun 15th, 2016
371
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.40 KB | None | 0 0
  1. address <- c("宜蘭縣數學鎮數學里10鄰數學路100巷16之2號",
  2. "基隆市太陽區太陽里17鄰太陽三街223之2號十九樓",
  3. "基隆市白雲區白雲里20鄰白雲三街59號十樓之1",
  4. "新竹市海洋區海洋里13鄰海洋路29號六樓",
  5. "臺北市小明區小名里20鄰小名路222號二十樓",
  6. "新北市語文區語文里17鄰語文路221號二十九樓之5",
  7. "宜蘭縣飛機鎮飛機里3鄰飛機路73號",
  8. "新北市紅色區紅色里15鄰紅色路四段15號之4十七樓")
  9.  
library(magrittr)
library(plyr)
library(stringr)
library(stringi)

  15. address_converted <- sapply(address, function(x){
  16.   raw_address <- charToRaw(x)
  17.   loc_maybe_fullwidth_digits <- which(raw_address == "a2")
  18.   second_loc <- raw_address[loc_maybe_fullwidth_digits+1] %>% as.integer
  19.   loc_fullwidth_digits <- loc_maybe_fullwidth_digits[second_loc >= 175 & second_loc <= 184] + 1
  20.   raw_address[loc_fullwidth_digits] <- raw_address[loc_fullwidth_digits] %>%
  21.     as.integer %>% '-'(127) %>% as.raw
  22.   return(rawToChar(raw_address[setdiff(1:length(raw_address), loc_fullwidth_digits-1)]))
  23. }) %>% `names<-`(NULL)
  24. # [1] "宜蘭縣數學鎮數學里10鄰數學路100巷16之2號"    
  25. # [2] "基隆市太陽區太陽里17鄰太陽三街223之2號十九樓"
  26. # [3] "基隆市白雲區白雲里20鄰白雲三街59號十樓之1"    
  27. # [4] "新竹市海洋區海洋里13鄰海洋路29號六樓"        
  28. # [5] "臺北市小明區小名里20鄰小名路222號二十樓"      
  29. # [6] "新北市語文區語文里17鄰語文路221號二十九樓之5"
  30. # [7] "宜蘭縣飛機鎮飛機里3鄰飛機路73號"              
  31. # [8] "新北市紅色區紅色里15鄰紅色路四段15號之4十七樓"
  32.  
  33. chinese2digits <- function(x){
  34.   vals <- sapply(str_split(x, "")[[1]], function(chi_digit){
  35.     mapvalues(chi_digit, c("零", "一", "二", "三", "四", "五", "六", "七", "八", "九",
  36.                            "十", "百", "千", "萬", "億"), c(0:10, 10^c(2,3,4,8)), FALSE)
  37.   }) %>% as.integer
  38.   digit_output <- 0
  39.   base_term <- 1
  40.   for (i in rev(seq_along(vals)))
  41.   {
  42.     if (vals[i] >= 10 && i == 1)
  43.     {
  44.       base_term <- ifelse(vals[i] > base_term, vals[i], base_term * vals[i])
  45.       digit_output <- digit_output + vals[i]
  46.     } else if (vals[i] >= 10)
  47.     {
  48.       base_term <- ifelse(vals[i] > base_term, vals[i], base_term * vals[i])
  49.     } else
  50.     {
  51.       digit_output <- digit_output + base_term * vals[i]
  52.     }
  53.   }
  54.   return(digit_output)
  55. }
  56.  
  57. ## test
  58. # chinese2digits("一百五十二") # 152
  59. # chinese2digits("一億零八萬零三百二十三") # 100080323
  60. # chinese2digits("十九") # 19
  61.  
  62. address_converted2 <- sapply(address_converted, function(x){
  63.   pattern_starts <- "[零一二三四五六七八九十百千萬億]+樓"
  64.   if (!str_detect(x, pattern_starts))
  65.     return(x)
  66.   stairs <- str_extract(x, pattern_starts)
  67.   x <- str_replace(x, str_c("(\\d+)(", pattern_starts, ")"), "\\1, \\2")
  68.   x <- str_replace(stairs, "樓", "") %>% chinese2digits %>% str_c("樓") %>%
  69.     {str_replace(x, stairs, .)}
  70.   return(x)
  71. }) %>% `names<-`(NULL)
  72.  
  73. # [1] "宜蘭縣數學鎮數學里10鄰數學路100巷16之2號"      "基隆市太陽區太陽里17鄰太陽三街223之2號19樓"  
  74. # [3] "基隆市白雲區白雲里20鄰白雲三街59號10樓之1"     "新竹市海洋區海洋里13鄰海洋路29號6樓"          
  75. # [5] "臺北市小明區小名里20鄰小名路222號20樓"         "新北市語文區語文里17鄰語文路221號29樓之5"    
  76. # [7] "宜蘭縣飛機鎮飛機里3鄰飛機路73號"               "新北市紅色區紅色里15鄰紅色路四段15號之4, 17樓"
  77.  
  78. sapply(address_converted2, str_extract_all, pattern = "\\d+")
  79. # $`宜蘭縣數學鎮數學里10鄰數學路100巷16之2號`
  80. # [1] "10"  "100" "16"  "2"  
  81. #
  82. # $基隆市太陽區太陽里17鄰太陽三街223之2號19樓
  83. # [1] "17"  "223" "2"   "19"
  84. #
  85. # $`基隆市白雲區白雲里20鄰白雲三街59號10樓之1, `
  86. # [1] "20" "59" "10" "1"
  87. #
  88. # $新竹市海洋區海洋里13鄰海洋路29號6樓
  89. # [1] "13" "29" "6"
  90. #
  91. # $臺北市小明區小名里20鄰小名路222號20樓
  92. # [1] "20"  "222" "20"
  93. #
  94. # $`新北市語文區語文里17鄰語文路221號29樓之5, `
  95. # [1] "17"  "221" "29"  "5"  
  96. #
  97. # $宜蘭縣飛機鎮飛機里3鄰飛機路73號
  98. # [1] "3"  "73"
  99. #
  100. # $`新北市紅色區紅色里15鄰紅色路四段15號之4, 17樓`
  101. # [1] "15" "15" "4"  "17"
Add Comment
Please, Sign In to add comment