Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //Find constructs of the form:
- //<word><punctuation_mark><tab_character>
- //after first splitting the text into sentences and separating out interjections: ([А!] [У!] [А?] [О!] [А. С. Пушкин])
- import org.omg.PortableInterceptor.SYSTEM_EXCEPTION;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.PrintStream;
- import java.util.Scanner;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Finds every {@code <word><punctuation mark><tab(s)>} construct in a text and
 * reports the findings as a small XML document.
 *
 * <p>The text is first split into sentences. A sentence ends at {@code .},
 * {@code !} or {@code ?} that follows a word of two or more characters, or at
 * {@code !} / {@code ?} that follows any word character (so one-letter
 * interjections such as "A!" end a sentence, while initials such as "A." do not).
 *
 * <p>Known limitations, kept from the original design: sentence terminators are
 * removed by the split, so a construct whose punctuation mark is the terminator
 * itself is never reported; line feeds are deleted (not replaced by a space)
 * before splitting.
 */
public class check {

    /** Prolog and opening root tag of the generated document. */
    private static final String XML_HEADER =
            "<?xml version=\"1.1\" encoding=\"UTF-8\" ?>\n<lab2_Belyaev>\n";

    /** Closing root tag; must match the root element opened in {@link #XML_HEADER}. */
    private static final String XML_FOOTER = "</lab2_Belyaev>";

    /**
     * End of a sentence: a 2+ character word followed by '.', '!' or '?', or any
     * word character followed by '!' or '?'. UNICODE_CHARACTER_CLASS makes
     * {@code \w} match non-ASCII letters (e.g. Cyrillic) as well.
     */
    private static final Pattern SENTENCE_END =
            Pattern.compile("\\w{2,}[.!?]|\\w[!?]", Pattern.UNICODE_CHARACTER_CLASS);

    /**
     * The construct being searched for: group 1 is the word, group 2 a single
     * character that is neither a word character nor whitespace, group 3 the
     * run of tab characters.
     */
    private static final Pattern WORD_PUNCT_TAB =
            Pattern.compile("(\\w+)([^\\w\\s])(\\t+)", Pattern.UNICODE_CHARACTER_CLASS);

    /**
     * Searches {@code entire_text} for word + punctuation + tab constructs,
     * prints the sentences, the resulting XML and the match count to standard
     * output, and writes the XML to {@code out.xml} (UTF-8) in the working
     * directory.
     *
     * <p>For every sentence containing at least one match a
     * {@code <sentence>} element is emitted once, followed by one
     * {@code <w>}/{@code <c>}/{@code <tab>} triple per match.
     *
     * @param entire_text the text to analyse; must not be {@code null}
     * @throws NullPointerException if {@code entire_text} is {@code null}
     */
    public static void form_xml(String entire_text) {
        java.util.Objects.requireNonNull(entire_text, "entire_text");

        String[] sentences = split_sentences(entire_text);
        for (String sentence : sentences) {
            System.out.println("S:" + '"' + sentence + '"');
        }

        StringBuilder xml = new StringBuilder(XML_HEADER);
        int matches = 0;
        for (String sentence : sentences) {
            Matcher m = WORD_PUNCT_TAB.matcher(sentence);
            boolean sentence_written = false;
            while (m.find()) {
                if (!sentence_written) {
                    // The sentence is emitted only once, before its first match.
                    xml.append("\t<sentence>").append(escape_xml(sentence)).append("</sentence>\n");
                    sentence_written = true;
                }
                xml.append("\t\t<w>").append(m.group(1)).append("</w>\n")
                   .append("\t\t <c>").append(escape_xml(m.group(2))).append("</c>\n")
                   .append("\t\t <tab>").append(m.group(3)).append("</tab>\n\n");
                matches++;
            }
        }
        xml.append(XML_FOOTER);

        System.out.println(xml);
        System.out.print("Matches:" + matches);

        // The prolog declares UTF-8, so the bytes on disk must be UTF-8 too,
        // regardless of the platform default charset.
        try (PrintStream out = new PrintStream(new FileOutputStream("out.xml"), false, "UTF-8")) {
            out.print(xml);
            // PrintStream swallows write failures; surface them explicitly.
            if (out.checkError()) {
                System.err.println("E2: failed to write out.xml");
            }
        } catch (java.io.IOException e) {
            System.err.println(e.toString());
        }
    }

    /**
     * Splits the text into sentences: every sentence terminator recognised by
     * {@link #SENTENCE_END} is rewritten to '!', line feeds are removed, and the
     * result is split on '!'. The terminators themselves are not part of the
     * returned sentences.
     */
    private static String[] split_sentences(String text) {
        StringBuilder marked = new StringBuilder(text);
        Matcher m = SENTENCE_END.matcher(text);
        while (m.find()) {
            // The terminator is always the last character of the match.
            marked.setCharAt(m.end() - 1, '!');
        }
        return marked.toString().replace("\n", "").split("!");
    }

    /** Escapes the characters that are markup-significant in XML element content. */
    private static String escape_xml(String text) {
        StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Reads {@code src/in.txt} as UTF-8 and passes its whole content to
     * {@link #form_xml(String)}. An empty file is treated as empty text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // NOTE(review): the input is assumed to be UTF-8 encoded - confirm for files
        // produced on systems with a different default charset.
        try (Scanner in = new Scanner(new File("src/in.txt"), "UTF-8")) {
            // "\\A" (start of input) as delimiter makes next() return the entire file.
            in.useDelimiter("\\A");
            String input_text = in.hasNext() ? in.next() : "";
            form_xml(input_text);
        } catch (java.io.FileNotFoundException e) {
            System.out.println("E1: file not found");
            System.err.println(e.toString());
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement