# Pastebin.com
# -*- coding: utf-8 -*-
#Check whether parsing Unicode text works
#Tested on x86_64 GNU/Linux (Suse Linux 11.0)

from pyparsing import *

#creating the parser
# Grammar: the keyword ABC, a pipe, any run of non-pipe characters,
# another pipe, then the keyword XYZ.
_PIPE = '|'
start_kw = Keyword('ABC')
end_kw = Keyword('XYZ')
the_parser = start_kw + _PIPE + CharsNotIn(_PIPE) + _PIPE + end_kw

#some test texts
# The paste site had replaced the non-ASCII payloads with HTML numeric
# character references (e.g. "&#12510;"), which turned every sample into
# plain ASCII. They are restored here to the characters those references
# encode so that the samples actually contain Unicode text.

# Katakana payloads.
text1 = 'ABC | マルチ | XYZ'
text2 = 'ABC | チディ | XYZ'
# CJK ideograph payloads.
text3 = 'ABC | 图形等 | XYZ'
text4 = 'ABC | 应用程 | XYZ'
# NOTE(review): the payloads of the next two samples were already replaced
# by U+FFFD (replacement character) in the paste and cannot be recovered
# from here -- restore them from the author's original file if available.
text5 = 'ABC | ��� | XYZ'
text6 = 'ABC | i�� | XYZ'
# Malformed samples: bad_1 lacks the first '|' and bad_2 lacks the second.
bad_1 = 'ABC  应用程 | XYZ'
bad_2 = 'ABC | 应用程  XYZ'

#function to present input output and errors in a nice way
def parse_text(text, parser=None):
    """Parse *text* and print the input together with the result or error.

    Prints a blank line, the text being parsed, and then either the parse
    result or, if the parser raises ``ParseException``, a "Parse error!"
    banner followed by the exception.

    Args:
        text: The string to parse.
        parser: Optional object exposing ``parseString(text)``. When
            omitted, the module-level ``the_parser`` is used.

    Returns:
        None. All output goes to standard output.
    """
    if parser is None:
        parser = the_parser

    # Every print below takes exactly one parenthesised argument, so the
    # statement produces the same output on Python 2 (print statement) and
    # Python 3 (print function). The double space after the colon keeps
    # the spacing of the original two-item print.
    print('')
    print('Trying to parse:  %s' % (text,))
    try:
        result = parser.parseString(text)
    except ParseException as error:  # "as" form is valid on Python 2.6+ and 3
        print('\nParse error!')
        print(error)
    else:
        # The 1-tuple keeps %-formatting from unpacking a sequence result.
        print('Result:  %s' % (result,))

#do some parsing
# Feed every sample to the parser in order: the six well-formed inputs
# first, then the two malformed ones.
for sample in (text1, text2, text3, text4, text5, text6, bad_1, bad_2):
    parse_text(sample)