Advertisement
jasperlow

word counting

Apr 29th, 2019
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.54 KB | None | 0 0
  1. private static final String REGEX_NOT = "^";
  2. private static final String REGEX_WORD_EXPECTED = "[%s]+";
  3. private static final String REGEX_LOWERCASE = "a-z";
  4. private static final String REGEX_UPPERCASE = "A-Z";
  5. private static final String REGEX_NUMERIC = "0-9";
  6. private static final String REGEX_WHITESPACE = "\\s+";
  7. private static final String REGEX_ENGLISH = "[a-zA-Z]+";
  8. private static final String REGEX_CHINESE = "\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC";
  9.  
  10.  
  11. public static int getWordCounts(String from) {
  12. int zhCount = getSplitedZhWords(from).length;
  13. int enCount = getSplitedEnWords(from).length;
  14. Timber.d("zhCount %d | enCount %d", zhCount, enCount);
  15. return zhCount + enCount;
  16. }
  17.  
  18. public static String getZhWords(String from) {
  19. return from
  20. .replaceAll(String.format(REGEX_WORD_EXPECTED, REGEX_NOT + REGEX_CHINESE), "")
  21. .replaceAll("", " ")
  22. .replaceAll(REGEX_WHITESPACE, " ")
  23. .trim();
  24. }
  25.  
  26. public static String getEnWords(String from) {
  27. return from
  28. .replaceAll(String.format(REGEX_WORD_EXPECTED, REGEX_NOT + REGEX_NUMERIC + REGEX_LOWERCASE + REGEX_UPPERCASE), " ")
  29. .replaceAll(String.format(REGEX_WORD_EXPECTED, REGEX_NOT + REGEX_ENGLISH), " ")
  30. .replaceAll(REGEX_WHITESPACE, " ")
  31. .trim();
  32. }
  33.  
  34. public static String[] getSplitedZhWords(String from) {
  35. final String zhWords = getZhWords(from);
  36. Timber.d("zhWords \n%s\n", zhWords);
  37. return (TextUtils.isEmpty(zhWords)) ? new String[0] : zhWords.split(REGEX_WHITESPACE);
  38. }
  39.  
  40. public static String[] getSplitedEnWords(String from) {
  41. final String enWords = getEnWords(from);
  42. Timber.d("enWords \n%s\n", enWords);
  43. return (TextUtils.isEmpty(enWords)) ? new String[0] : enWords.split(REGEX_WHITESPACE);
  44. }
  45.  
  46.  
  47. public static int getWordCounts(String find, String from) {
  48. return getWordCounts(false, find, from);
  49. }
  50.  
  51. public static int getWordCounts(boolean ignoreCase, String find, String from) {
  52. if (ignoreCase) find = find.toLowerCase(Locale.ENGLISH);
  53. if (ignoreCase) from = from.toLowerCase(Locale.ENGLISH);
  54. final ArrayList<String> fromList = new ArrayList<>();
  55. fromList.addAll(Arrays.asList(getSplitedZhWords(from)));
  56. fromList.addAll(Arrays.asList(getSplitedEnWords(from)));
  57. return Collections.frequency(fromList, find);
  58. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement