/* * The MIT License (MIT) * * Copyright © 2016 Emily Mabrey * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package org.emabrey.stackoverflow.java; import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.BreakIterator; import java.text.Normalizer; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Scanner; /** * Example class for use in answering this StackOverflow question * on getting the length of a String. This class requires a file named "unicodetest.txt" be placed alongside it in * the same package as a JAR resource; that file should contain the contents of the Quick Brown Unicode file available * here. 
*
 * <p>The StackOverflow answer and the Quick Brown sample text referred to above are linked below.
 *
 * @see <a href="http://stackoverflow.com/a/41309320/2446574">The StackOverflow answer</a>
 * @see <a href="http://www.cl.cam.ac.uk/~mgk25/ucs/examples/quickbrown.txt">Quick Brown Unicode file</a>
 *
 * @author Emily Mabrey
 */
public class StringLengthExample {

    /** Name of the sample-text resource, looked up relative to this class's package. */
    private static final String RESOURCE_NAME = "unicodetest.txt";

    /**
     * Reads the sample text as UTF-8 and prints its length, measured three different ways, for the
     * text as read and for its NFC and NFD normalized forms.
     *
     * @param args ignored
     * @throws IllegalStateException if the sample-text resource is not on the classpath
     */
    public static void main(String[] args) {
        final InputStream resource = StringLengthExample.class.getResourceAsStream(RESOURCE_NAME);
        if (resource == null) {
            // getResourceAsStream signals "not found" with null; fail with a message that names
            // the missing file instead of letting Scanner throw a bare NullPointerException.
            throw new IllegalStateException("Required classpath resource \"" + RESOURCE_NAME
                    + "\" was not found alongside " + StringLengthExample.class.getName());
        }

        final String inputQuickBrownString = convertStreamToString(resource, StandardCharsets.UTF_8);
        final String nfcNormalizedString = Normalizer.normalize(inputQuickBrownString, Normalizer.Form.NFC);
        final String nfdNormalizedString = Normalizer.normalize(inputQuickBrownString, Normalizer.Form.NFD);

        System.out.println("Input UTF-8 String");
        printAllLengths(inputQuickBrownString);
        System.out.println("NFC Normalized UTF-8 String");
        printAllLengths(nfcNormalizedString);
        System.out.println("NFD Normalized UTF-8 String");
        printAllLengths(nfdNormalizedString);
    }

    /**
     * Prints the UTF-16 code unit count, the Unicode code point count and the
     * {@link BreakIterator} character-boundary count of the given string.
     *
     * @param string the text to measure
     */
    private static void printAllLengths(String string) {
        final int graphemeCount = countGraphemeClusters(string);

        System.out.println(String.format(">>\tString.length() = %d", string.length()));
        System.out.println(String.format(">>\tString.codePointCount(int,int) = %d", string.codePointCount(0, string.length())));
        System.out.println(String.format(">>\tBreakIterator.getCharacterInstance(Locale) = %d", graphemeCount));
    }

    /**
     * Counts the character boundaries that {@link BreakIterator#getCharacterInstance(Locale)}
     * (English locale) finds after the start of the text, i.e. the number of user-perceived
     * characters it reports.
     *
     * <p>Package-private so the counting logic can be exercised directly.
     *
     * @param string the text to measure
     * @return the number of boundaries found; {@code 0} for an empty string
     */
    static int countGraphemeClusters(String string) {
        final BreakIterator boundaries = BreakIterator.getCharacterInstance(Locale.ENGLISH);
        boundaries.setText(string);

        // Only the number of boundaries matters, so count them instead of collecting every
        // boundary index into a list.
        int count = 0;
        while (boundaries.next() != BreakIterator.DONE) {
            count++;
        }
        return count;
    }

    /**
     * Reads the whole stream into a single string and closes the stream.
     *
     * @param is the stream to drain; closed before this method returns
     * @param c the charset used to decode the stream's bytes
     * @return the decoded content, or an empty string if the stream yields no tokens
     */
    private static String convertStreamToString(InputStream is, Charset c) {
        // Closing the Scanner also closes the stream it wraps.
        try (Scanner scanner = new Scanner(is, c.name())) {
            // "\\A" matches only at the beginning of input, so the first token is the entire content.
            scanner.useDelimiter("\\A");
            // next() throws NoSuchElementException when there is no token (empty stream).
            return scanner.hasNext() ? scanner.next() : "";
        }
    }
}