Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Index: encoding.c
- ===================================================================
- --- encoding.c (revision 4140)
- +++ encoding.c (working copy)
- @@ -146,6 +146,32 @@ mr_enc_dummy_p(VALUE self, SEL sel)
- return Qfalse;
- }
- +// Builds the default replacement string for `destination`: U+FFFD encoded
- +// in the matching byte layout for UTF-8/16/32, and '?' for everything else.
- +rb_str_t *replacement_string_for_encoding(rb_encoding_t* destination)
- +{
- + // Big-endian forms put the high byte of U+FFFD first.
- + if (destination == rb_encodings[ENCODING_UTF16BE]) {
- + return RSTR(rb_enc_str_new("\xFF\xFD", 2, destination));
- + }
- + if (destination == rb_encodings[ENCODING_UTF32BE]) {
- + return RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, destination));
- + }
- + // Little-endian forms are the same code unit with the bytes swapped.
- + if (destination == rb_encodings[ENCODING_UTF16LE]) {
- + return RSTR(rb_enc_str_new("\xFD\xFF", 2, destination));
- + }
- + if (destination == rb_encodings[ENCODING_UTF32LE]) {
- + return RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, destination));
- + }
- + if (destination == rb_encodings[ENCODING_UTF8]) {
- + return RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, destination));
- + }
- + // Any other destination: start from an ASCII '?' and transcode it.
- + rb_str_t *question_mark = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
- + return str_simple_transcode(question_mark, destination);
- +}
- +
- static void
- define_encoding_constant(const char *name, rb_encoding_t *encoding)
- {
- @@ -291,6 +317,7 @@ Init_PreEncoding(void)
- add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5", 1, false, true, "CP950", NULL);
- // FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables
- add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP", 1, false, true, "eucJP", NULL);
- + add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS", 1, false, true, "SJIS", NULL);
- //add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL);
- //add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL);
- //add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);
- Index: encoding.h
- ===================================================================
- --- encoding.h (revision 4140)
- +++ encoding.h (working copy)
- @@ -148,7 +148,7 @@ enum {
- ENCODING_MACCYRILLIC,
- ENCODING_BIG5,
- ENCODING_EUCJP,
- - //ENCODING_SJIS,
- + ENCODING_SJIS,
- //ENCODING_CP932,
- ENCODINGS_COUNT
- @@ -293,6 +293,40 @@ str_set_valid_encoding(rb_str_t *self, bool status)
- STRING_VALID_ENCODING);
- }
- +typedef enum {
- + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
- + TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING,
- + TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT,
- + TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR
- +} transcode_behavior_t;
- +
- +typedef enum { // bit values follow MRI's ruby/encoding.h ECONV_* definitions
- + ECONV_INVALID_MASK = 0x0000000f, // field holding the invalid-sequence mode
- + ECONV_INVALID_REPLACE = 0x00000002,
- + ECONV_UNDEF_MASK = 0x000000f0, // field holding the undefined-conversion mode
- + ECONV_UNDEF_REPLACE = 0x00000020,
- + ECONV_UNDEF_HEX_CHARREF = 0x00000030,
- + ECONV_PARTIAL_INPUT = 0x00010000,
- + ECONV_AFTER_OUTPUT = 0x00020000,
- + ECONV_UNIVERSAL_NEWLINE_DECORATOR = 0x00000100,
- + ECONV_CRLF_NEWLINE_DECORATOR = 0x00001000,
- + ECONV_CR_NEWLINE_DECORATOR = 0x00002000,
- + ECONV_XML_TEXT_DECORATOR = 0x00004000,
- + ECONV_XML_ATTR_CONTENT_DECORATOR = 0x00008000,
- + ECONV_XML_ATTR_QUOTE_DECORATOR = 0x00100000
- +} transcode_flags_t;
- +
- +rb_str_t *str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
- + int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str);
- +
- +static inline rb_str_t *
- +str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding)
- +{
- + return str_transcode(self, self->encoding, dst_encoding,
- + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
- +}
- +
- +
- void rb_str_NSCoder_encode(void *coder, VALUE str, const char *key);
- VALUE rb_str_NSCoder_decode(void *coder, const char *key);
- @@ -319,6 +353,10 @@ unsigned long rb_str_hash_uchars(const UChar *chars, long chars_len);
- long rb_uchar_strtol(UniChar *chars, long chars_len, long pos,
- long *end_offset);
- void rb_str_force_encoding(VALUE str, rb_encoding_t *encoding);
- +rb_str_t *str_need_string(VALUE str);
- +rb_str_t *replacement_string_for_encoding(rb_encoding_t* enc);
- +void str_replace_with_string(rb_str_t *self, rb_str_t *source);
- +
- #if defined(__cplusplus)
- } // extern "C"
- Index: inits.c
- ===================================================================
- --- inits.c (revision 4140)
- +++ inits.c (working copy)
- @@ -58,6 +58,7 @@ void Init_ObjC(void);
- void Init_BridgeSupport(void);
- void Init_FFI(void);
- void Init_Dispatch(void);
- +void Init_Transcode(void);
- void Init_PostVM(void);
- void
- @@ -110,5 +111,6 @@ rb_call_inits()
- Init_BridgeSupport();
- Init_FFI();
- Init_Dispatch();
- + Init_Transcode();
- Init_PostVM();
- }
- Index: rakelib/builder/builder.rb
- ===================================================================
- --- rakelib/builder/builder.rb (revision 4140)
- +++ rakelib/builder/builder.rb (working copy)
- @@ -6,7 +6,7 @@ OBJS = %w{
- random range rational re ruby signal sprintf st string struct time
- util variable version thread id objc bs ucnv encoding main dln dmyext marshal
- gcd vm_eval gc-stub bridgesupport compiler dispatcher vm symbol debugger MacRuby
- - MacRubyDebuggerConnector NSArray NSDictionary NSString
- + MacRubyDebuggerConnector NSArray NSDictionary NSString transcode
- }
- EXTENSIONS = %w{
- Index: spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt (working copy)
- @@ -1,7 +1,4 @@
- -fails:Encoding::Converter.asciicompat_encoding accepts an encoding name as a String argument
- fails:Encoding::Converter.asciicompat_encoding coerces non-String/Encoding objects with #to_str
- fails:Encoding::Converter.asciicompat_encoding accepts an Encoding object as an argument
- fails:Encoding::Converter.asciicompat_encoding returns a corresponding ASCII compatible encoding for ASCII-incompatible encodings
- -fails:Encoding::Converter.asciicompat_encoding returns nil when the given encoding is ASCII compatible
- fails:Encoding::Converter.asciicompat_encoding handles encoding names who resolve to nil encodings
- -fails:Encoding::Converter.asciicompat_encoding returns nil if called with an encoding it returned previously
- Index: spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt
- deleted file mode 100644
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt (revision 4140)
- +++ /dev/null (working copy)
- @@ -1,26 +0,0 @@
- -fails:Encoding::Converter::INVALID_MASK exists
- -fails:Encoding::Converter::INVALID_MASK has a Fixnum value
- -fails:Encoding::Converter::INVALID_REPLACE exists
- -fails:Encoding::Converter::INVALID_REPLACE has a Fixnum value
- -fails:Encoding::Converter::UNDEF_MASK exists
- -fails:Encoding::Converter::UNDEF_MASK has a Fixnum value
- -fails:Encoding::Converter::UNDEF_REPLACE exists
- -fails:Encoding::Converter::UNDEF_REPLACE has a Fixnum value
- -fails:Encoding::Converter::UNDEF_HEX_CHARREF exists
- -fails:Encoding::Converter::UNDEF_HEX_CHARREF has a Fixnum value
- -fails:Encoding::Converter::PARTIAL_INPUT exists
- -fails:Encoding::Converter::PARTIAL_INPUT has a Fixnum value
- -fails:Encoding::Converter::AFTER_OUTPUT exists
- -fails:Encoding::Converter::AFTER_OUTPUT has a Fixnum value
- -fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR exists
- -fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR has a Fixnum value
- -fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR exists
- -fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR has a Fixnum value
- -fails:Encoding::Converter::CR_NEWLINE_DECORATOR exists
- -fails:Encoding::Converter::CR_NEWLINE_DECORATOR has a Fixnum value
- -fails:Encoding::Converter::XML_TEXT_DECORATOR exists
- -fails:Encoding::Converter::XML_TEXT_DECORATOR has a Fixnum value
- -fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR exists
- -fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR has a Fixnum value
- -fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR exists
- -fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR has a Fixnum value
- Index: spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt (working copy)
- @@ -1,7 +1,2 @@
- -fails:Encoding::Converter#convert returns a String
- -fails:Encoding::Converter#convert sets the encoding of the result to the target encoding
- -fails:Encoding::Converter#convert transcodes the given String to the target encoding
- fails:Encoding::Converter#convert allows Strings of different encodings to the source encoding
- -fails:Encoding::Converter#convert reuses the given encoding pair if called multiple times
- -fails:Encoding::Converter#convert raises UndefinedConversionError if the String contains characters invalid for the target encoding
- -fails:Encoding::Converter#convert raises an ArgumentError if called on a finished stream
- +
- Index: spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt (working copy)
- @@ -1,7 +1,2 @@
- -fails:Encoding::Converter#convpath returns an Array
- -fails:Encoding::Converter#convpath returns each encoding pair as a sub-Array
- -fails:Encoding::Converter#convpath returns each encoding as an Encoding object
- fails:Encoding::Converter#convpath returns multiple encoding pairs when direct conversion is impossible
- -fails:Encoding::Converter#convpath sets the last element of each pair to the first element of the next
- -fails:Encoding::Converter#convpath only lists a source encoding once
- fails:Encoding::Converter#convpath indicates if crlf_newline conversion would occur
- Index: spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt (working copy)
- @@ -1 +1 @@
- -fails:Encoding::Converter#destination_encoding returns the destination encoding as an Encoding object
- +
- Index: spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt (working copy)
- @@ -1,8 +1,3 @@
- fails:Encoding::Converter#replacement returns '?' in US-ASCII when the destination encoding is not UTF-8
- -fails:Encoding::Converter#replacement returns � when the destination encoding is UTF-8
- -fails:Encoding::Converter#replacement= accepts a String argument
- -fails:Encoding::Converter#replacement= accepts a String argument of arbitrary length
- -fails:Encoding::Converter#replacement= raises an TypeError if assigned a non-String argument
- -fails:Encoding::Converter#replacement= sets #replacement
- fails:Encoding::Converter#replacement= raises an UndefinedConversionError is the argument cannot be converted into the destination encoding
- fails:Encoding::Converter#replacement= does not change the replacement character if the argument cannot be converted into the destination encoding
- Index: spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt (working copy)
- @@ -1,8 +1,3 @@
- -fails:Encoding::Converter.search_convpath returns an Array
- -fails:Encoding::Converter.search_convpath returns each encoding pair as a sub-Array
- -fails:Encoding::Converter.search_convpath returns each encoding as an Encoding object
- fails:Encoding::Converter.search_convpath returns multiple encoding pairs when direct conversion is impossible
- -fails:Encoding::Converter.search_convpath sets the last element of each pair to the first element of the next
- -fails:Encoding::Converter.search_convpath only lists a source encoding once
- fails:Encoding::Converter.search_convpath indicates if crlf_newline conversion would occur
- fails:Encoding::Converter.search_convpath raises an Encoding::ConverterNotFoundError if no conversion path exists
- Index: spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt
- ===================================================================
- --- spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt (revision 4140)
- +++ spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt (working copy)
- @@ -1 +1 @@
- -fails:Encoding::Converter#source_encoding returns the source encoding as an Encoding object
- +
- Index: string.c
- ===================================================================
- --- string.c (revision 4140)
- +++ string.c (working copy)
- @@ -251,7 +251,7 @@ str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
- }
- }
- -static void
- +void
- str_replace_with_string(rb_str_t *self, rb_str_t *source)
- {
- if (self == source) {
- @@ -1118,7 +1118,7 @@ str_include_string(rb_str_t *self, rb_str_t *searched)
- self->length_in_bytes, true) != -1;
- }
- -static rb_str_t *
- +rb_str_t *
- str_need_string(VALUE str)
- {
- switch (TYPE(str)) {
- @@ -1247,24 +1247,6 @@ rstr_append(VALUE str, VALUE substr)
- }
- }
- -enum {
- - TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
- - TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING,
- - TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT,
- - TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR
- -};
- -
- -
- -static rb_str_t *
- -str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
- - int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str);
- -static inline rb_str_t *
- -str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding)
- -{
- - return str_transcode(self, self->encoding, dst_encoding,
- - TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
- -}
- -
- static void inline
- str_concat_ascii_cstr(rb_str_t *self, char *cstr)
- {
- @@ -1280,7 +1262,7 @@ str_concat_ascii_cstr(rb_str_t *self, char *cstr)
- }
- }
- -static rb_str_t *
- +rb_str_t *
- str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
- int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str)
- {
- @@ -1844,165 +1826,6 @@ rstr_is_ascii_only(VALUE self, SEL sel)
- return str_is_ruby_ascii_only(RSTR(self)) ? Qtrue : Qfalse;
- }
- -/*
- - * call-seq:
- - * str.encode(encoding [, options] ) => str
- - * str.encode(dst_encoding, src_encoding [, options] ) => str
- - * str.encode([options]) => str
- - *
- - * The first form returns a copy of <i>str</i> transcoded
- - * to encoding +encoding+.
- - * The second form returns a copy of <i>str</i> transcoded
- - * from src_encoding to dst_encoding.
- - * The last form returns a copy of <i>str</i> transcoded to
- - * <code>Encoding.default_internal</code>.
- - * By default, the first and second form raise
- - * Encoding::UndefinedConversionError for characters that are
- - * undefined in the destination encoding, and
- - * Encoding::InvalidByteSequenceError for invalid byte sequences
- - * in the source encoding. The last form by default does not raise
- - * exceptions but uses replacement strings.
- - * The <code>options</code> Hash gives details for conversion.
- - *
- - * === options
- - * The hash <code>options</code> can have the following keys:
- - * :invalid ::
- - * If the value is <code>:replace</code>, <code>#encode</code> replaces
- - * invalid byte sequences in <code>str</code> with the replacement character.
- - * The default is to raise the exception
- - * :undef ::
- - * If the value is <code>:replace</code>, <code>#encode</code> replaces
- - * characters which are undefined in the destination encoding with
- - * the replacement character.
- - * :replace ::
- - * Sets the replacement string to the value. The default replacement
- - * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
- - * :xml ::
- - * The value must be <code>:text</code> or <code>:attr</code>.
- - * If the value is <code>:text</code> <code>#encode</code> replaces
- - * undefined characters with their (upper-case hexadecimal) numeric
- - * character references. '&', '<', and '>' are converted to "&amp;",
- - * "&lt;", and "&gt;", respectively.
- - * If the value is <code>:attr</code>, <code>#encode</code> also quotes
- - * the replacement result (using '"'), and replaces '"' with "&quot;".
- - */
- -extern rb_encoding_t *default_internal;
- -static VALUE
- -rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
- -{
- - VALUE opt = Qnil;
- - if (argc > 0) {
- - opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
- - if (!NIL_P(opt)) {
- - argc--;
- - }
- - }
- -
- - rb_str_t *self = RSTR(str);
- - rb_str_t *replacement_str = NULL;
- - rb_encoding_t *src_encoding, *dst_encoding;
- - int behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- - int behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- - if (argc == 0) {
- - src_encoding = self->encoding;
- - dst_encoding = default_internal;
- - behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- - }
- - else if (argc == 1) {
- - src_encoding = self->encoding;
- - dst_encoding = rb_to_encoding(argv[0]);
- - }
- - else if (argc == 2) {
- - dst_encoding = rb_to_encoding(argv[0]);
- - src_encoding = rb_to_encoding(argv[1]);
- - }
- - else {
- - rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
- - }
- -
- - if (!NIL_P(opt)) {
- - VALUE invalid_val = rb_hash_aref(opt, ID2SYM(rb_intern("invalid")));
- - VALUE replace_sym = ID2SYM(rb_intern("replace"));
- - if (invalid_val == replace_sym) {
- - behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- - }
- - VALUE undefined_val = rb_hash_aref(opt, ID2SYM(rb_intern("undefined")));
- - if (undefined_val == replace_sym) {
- - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- - }
- - VALUE xml_val = rb_hash_aref(opt, ID2SYM(rb_intern("xml")));
- - if (xml_val == ID2SYM(rb_intern("text"))) {
- - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
- - }
- - else if (xml_val == ID2SYM(rb_intern("attr"))) {
- - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
- - }
- -
- - VALUE replacement = rb_hash_aref(opt, replace_sym);
- - if (!NIL_P(replacement)) {
- - replacement_str = str_need_string(replacement);
- - if ((replacement_str->encoding != dst_encoding) && (replacement_str->length_in_bytes > 0)) {
- - replacement_str = str_simple_transcode(replacement_str, dst_encoding);
- - }
- - if ((behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
- - && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
- - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- - }
- - }
- - }
- -
- - if ((replacement_str == NULL)
- - && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
- - || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
- - if (dst_encoding == rb_encodings[ENCODING_UTF16BE]) {
- - replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, dst_encoding));
- - }
- - else if (dst_encoding == rb_encodings[ENCODING_UTF32BE]) {
- - replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, dst_encoding));
- - }
- - else if (dst_encoding == rb_encodings[ENCODING_UTF16LE]) {
- - replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, dst_encoding));
- - }
- - else if (dst_encoding == rb_encodings[ENCODING_UTF32LE]) {
- - replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, dst_encoding));
- - }
- - else if (dst_encoding == rb_encodings[ENCODING_UTF8]) {
- - replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, dst_encoding));
- - }
- - else {
- - replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
- - replacement_str = str_simple_transcode(replacement_str, dst_encoding);
- - }
- - }
- -
- - return (VALUE)str_transcode(self, src_encoding, dst_encoding,
- - behavior_for_invalid, behavior_for_undefined, replacement_str);
- -}
- -
- -/*
- - * call-seq:
- - * str.encode!(encoding [, options] ) => str
- - * str.encode!(dst_encoding, src_encoding [, options] ) => str
- - *
- - * The first form transcodes the contents of <i>str</i> from
- - * str.encoding to +encoding+.
- - * The second form transcodes the contents of <i>str</i> from
- - * src_encoding to dst_encoding.
- - * The options Hash gives details for conversion. See String#encode
- - * for details.
- - * Returns the string even if no changes were made.
- - */
- -static VALUE
- -rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
- -{
- - rstr_modify(str);
- -
- - VALUE new_str = rstr_encode(str, sel, argc, argv);
- - str_replace_with_string(RSTR(str), RSTR(new_str));
- - return str;
- -}
- -
- /*
- * call-seq:
- @@ -5958,8 +5781,6 @@ Init_String(void)
- rb_objc_define_method(rb_cRubyString, "partition", rstr_partition, 1);
- rb_objc_define_method(rb_cRubyString, "rpartition", rstr_rpartition, 1);
- rb_objc_define_method(rb_cRubyString, "crypt", rstr_crypt, 1);
- - rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
- - rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
- // MacRuby extensions.
- rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1);
- Index: transcode.c
- new file mode 100644
- ===================================================================
- --- /dev/null (revision 4140)
- +++ transcode.c (working copy)
- @@ -0,0 +1,450 @@
- +/*
- + * MacRuby implementation of transcode.c.
- + *
- + * This file is covered by the Ruby license. See COPYING for more details.
- + *
- + * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
- + * Copyright (C) 1993-2007 Yukihiro Matsumoto
- + * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
- + * Copyright (C) 2000 Information-technology Promotion Agency, Japan
- + */
- +
- +// Notes:
- +// AFAICT, we need to add support for newline decorators.
- +
- +#include "ruby.h"
- +#include "ruby/encoding.h"
- +#include "encoding.h"
- +
- +static VALUE sym_invalid;
- +static VALUE sym_undef;
- +static VALUE sym_replace;
- +static VALUE sym_xml;
- +static VALUE sym_text;
- +static VALUE sym_attr;
- +
- +typedef struct rb_econv_s {
- + rb_encoding_t *source;
- + rb_encoding_t *destination;
- + transcode_behavior_t invalid_sequence_behavior;
- + transcode_behavior_t undefined_conversion_behavior;
- + transcode_flags_t special_flags;
- + rb_str_t *replacement;
- + bool finished;
- +} rb_econv_t;
- +
- +VALUE rb_cEncodingConverter;
- +
- +static rb_econv_t* RConverter(VALUE self) {
- + rb_econv_t *conv;
- + Data_Get_Struct(self, rb_econv_t, conv);
- + return conv;
- +}
- +
- +static VALUE
- +rb_econv_alloc(VALUE klass, SEL sel)
- +{
- + rb_econv_t *conv = ALLOC(rb_econv_t);
- + conv->source = NULL;
- + conv->destination = NULL;
- + conv->replacement = NULL;
- + conv->special_flags = 0;
- + conv->finished = false;
- + return Data_Wrap_Struct(klass, 0, 0, conv);
- +}
- +
- +static VALUE
- +rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
- +{
- + rb_encoding_t *enc = NULL;
- + if (CLASS_OF(arg) == rb_cEncoding) {
- + enc = rb_to_encoding(arg);
- + }
- + else {
- + StringValue(arg);
- + enc = rb_enc_find(RSTRING_PTR(arg));
- + }
- +
- + if ((enc == NULL) || (enc->ascii_compatible)) {
- + return Qnil;
- + }
- + else if (UTF16_ENC(enc) || UTF32_ENC(enc)) {
- + return (VALUE)rb_utf8_encoding();
- + }
- + // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
- + rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name);
- +}
- +
- +static VALUE rb_econv_convpath(VALUE self, SEL sel);
- +
- +static VALUE
- +rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE* argv)
- +{
- + return rb_econv_convpath(rb_class_new_instance(argc, argv, klass), sel);
- +}
- +
- +static transcode_behavior_t // maps an option symbol to a behavior; nil yields `otherwise`
- +symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char* name)
- +{
- + if (given_symbol == sym_replace) {
- + return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- + }
- + else if (given_symbol == sym_attr) {
- + return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
- + }
- + else if (given_symbol == sym_text) {
- + return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
- + }
- + else if (!NIL_P(given_symbol)) { // go through #to_s: StringValuePtr only coerces via #to_str, which Symbols lack
- + rb_raise(rb_eArgError, "unknown value '%s' for option %s", RSTRING_PTR(rb_obj_as_string(given_symbol)), name);
- + }
- + return otherwise;
- +}
- +
- +static void parse_conversion_options(VALUE options, transcode_behavior_t* behavior_for_invalid,
- + transcode_behavior_t* behavior_for_undefined, rb_str_t** replacement_str, rb_encoding_t* destination)
- +{
- + // Reads the :invalid, :undef, :xml and :replace keys of `options` and stores
- + // the resulting behaviors / replacement string through the out-parameters.
- + *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
- + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "invalid-character");
- +
- + *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
- + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "undefined-conversion");
- +
- + // Because the API conflates the :xml and :undef options, we pass in the previous setting
- + *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
- + *behavior_for_undefined, "xml-replacement");
- +
- + // A given :replace string is transcoded up front so it is already in the
- + // destination encoding; *replacement_str is not written when :replace is nil.
- + VALUE replacement = rb_hash_aref(options, sym_replace);
- + if (!NIL_P(replacement)) {
- + *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
- + }
- +
- +}
- +
- +static VALUE // converter initializer: (source, destination [, options])
- +rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE* argv)
- +{
- + rb_econv_t *conv = RConverter(self);
- + VALUE sourceobj, destobj, options;
- + rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);
- +
- + rb_encoding_t* source = rb_to_encoding(sourceobj);
- + rb_encoding_t* destination = rb_to_encoding(destobj);
- + rb_str_t* replacement_str = NULL;
- +
- + conv->source = source;
- + conv->destination = destination;
- +
- + conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- + conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- +
- + // The optional third argument may be an options hash or a fixnum; only the hash form is handled.
- + if (!NIL_P(options)) {
- + // Unsupported input must raise rather than rb_bug(): callers can reach this path.
- + if (FIXNUM_P(options)) {
- + rb_raise(rb_eNotImpError, "fixnum arguments are not supported yet.");
- + }
- + else if (TYPE(options) == T_HASH) {
- + parse_conversion_options(options, &conv->invalid_sequence_behavior,
- + &conv->undefined_conversion_behavior, &replacement_str, destination);
- + }
- + else {
- + rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
- + }
- + }
- +
- + // Get the default replacement string. For UTF-[8, 16, 32] it's U+FFFD, and for others it's '?'
- + if (replacement_str == NULL) {
- + replacement_str = replacement_string_for_encoding(destination);
- + }
- + GC_WB(&conv->replacement, replacement_str);
- +
- + return self;
- +}
- +
- +static VALUE
- +rb_econv_inspect(VALUE self, SEL sel)
- +{
- + // TODO: make this comply with the MRI output when we add newline decorators
- + rb_econv_t *conv = RConverter(self);
- + return rb_sprintf("#<%s: %s to %s>", rb_obj_classname(self), conv->source->public_name,
- + conv->destination->public_name);
- +}
- +
- +static VALUE
- +rb_econv_convpath(VALUE self, SEL sel)
- +{
- + // in MacRuby, the convpath always looks like this:
- + // [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]]
- + // The first element is omitted if the source encoding is UTF-16, obviously.
- + rb_econv_t *conv = RConverter(self);
- + VALUE to_return = rb_ary_new2(2);
- + rb_encoding_t* nativeUTF16 = rb_encodings[ENCODING_UTF16_NATIVE];
- +
- + if (conv->source != nativeUTF16) {
- + rb_ary_push(to_return, rb_assoc_new((VALUE)conv->source, (VALUE)nativeUTF16));
- + }
- +
- + rb_ary_push(to_return, rb_assoc_new((VALUE)nativeUTF16, (VALUE)conv->destination));
- +
- + return to_return;
- +}
- +
- +static VALUE
- +rb_econv_source_encoding(VALUE self, SEL sel)
- +{
- + return (VALUE)(RConverter(self)->source);
- +}
- +
- +static VALUE
- +rb_econv_destination_encoding(VALUE self, SEL sel)
- +{
- + return (VALUE)(RConverter(self)->destination);
- +}
- +
- +// Since our converter is basically a black box at this point, we'll leave
- +// the lower-level methods unimplemented.
- +#define rb_econv_primitive_convert rb_f_notimplement
- +
- +static VALUE
- +rb_econv_convert(VALUE self, SEL sel, VALUE str)
- +{
- + // Transcodes `str` from the converter's source encoding to its destination encoding.
- + rb_econv_t *conv = RConverter(self);
- + if (conv->finished) {
- + rb_raise(rb_eArgError, "convert() called on a finished stream");
- + }
- +
- + assert(conv->replacement->encoding == conv->destination);
- + return (VALUE)str_transcode(str_need_string(str), conv->source, conv->destination,
- + conv->invalid_sequence_behavior, conv->undefined_conversion_behavior, conv->replacement);
- +}
- +
- +static VALUE
- +rb_econv_finish(VALUE self, SEL sel)
- +{
- + // TODO: Flesh this out later.
- + RConverter(self)->finished = true;
- + return rb_str_new2("");
- +}
- +
- +#define rb_econv_primitive_errinfo rb_f_notimplement
- +
- +#define rb_econv_insert_output rb_f_notimplement
- +
- +#define rb_econv_putback rb_f_notimplement
- +
- +#define rb_econv_last_error rb_f_notimplement
- +
- +static VALUE
- +rb_econv_replacement(VALUE self, SEL sel)
- +{
- + return (VALUE)(RConverter(self)->replacement);
- +}
- +
- +static VALUE
- +rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
- +{
- + // Store `str` transcoded to the destination encoding (as :replace is at construction); `str` is not retagged.
- + rb_econv_t *conv = RConverter(self);
- + if (TYPE(str) != T_STRING) {
- + rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
- + }
- + rb_str_t *converted = str_simple_transcode(str_need_string(str), conv->destination);
- + GC_WB(&conv->replacement, converted);
- + return str;
- +}
- +
- +/*
- + * call-seq:
- + * str.encode(encoding [, options] ) => str
- + * str.encode(dst_encoding, src_encoding [, options] ) => str
- + * str.encode([options]) => str
- + *
- + * The first form returns a copy of <i>str</i> transcoded
- + * to encoding +encoding+.
- + * The second form returns a copy of <i>str</i> transcoded
- + * from src_encoding to dst_encoding.
- + * The last form returns a copy of <i>str</i> transcoded to
- + * <code>Encoding.default_internal</code>.
- + * By default, the first and second form raise
- + * Encoding::UndefinedConversionError for characters that are
- + * undefined in the destination encoding, and
- + * Encoding::InvalidByteSequenceError for invalid byte sequences
- + * in the source encoding. The last form by default does not raise
- + * exceptions but uses replacement strings.
- + * The <code>options</code> Hash gives details for conversion.
- + *
- + * === options
- + * The hash <code>options</code> can have the following keys:
- + * :invalid ::
- + * If the value is <code>:replace</code>, <code>#encode</code> replaces
- + * invalid byte sequences in <code>str</code> with the replacement character.
- + * The default is to raise the exception.
- + * :undef ::
- + * If the value is <code>:replace</code>, <code>#encode</code> replaces
- + * characters which are undefined in the destination encoding with
- + * the replacement character.
- + * :replace ::
- + * Sets the replacement string to the value. The default replacement
- + * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
- + * :xml ::
- + * The value must be <code>:text</code> or <code>:attr</code>.
- + * If the value is <code>:text</code> <code>#encode</code> replaces
- + * undefined characters with their (upper-case hexadecimal) numeric
- + * character references. '&', '<', and '>' are converted to "&amp;",
- + * "&lt;", and "&gt;", respectively.
- + * If the value is <code>:attr</code>, <code>#encode</code> also quotes
- + * the replacement result (using '"'), and replaces '"' with "&quot;".
- + */
- +extern rb_encoding_t *default_internal;
- +// Implementation of String#encode. A trailing argument that converts to a
- +// Hash is taken as the options; the remaining 0, 1 or 2 arguments choose the
- +// destination (and optionally source) encoding. Any other count raises
- +// ArgumentError. Returns the result of str_transcode() as a VALUE.
- +static VALUE
- +rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
- +{
- +    // Peel off the options hash, if the last argument is one.
- +    VALUE options = Qnil;
- +    if (argc > 0) {
- +        options = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
- +        if (!NIL_P(options)) {
- +            argc--;
- +        }
- +    }
- +
- +    rb_str_t *receiver = RSTR(str);
- +    rb_str_t *replacement = NULL;
- +    rb_encoding_t *from = NULL;
- +    rb_encoding_t *to = NULL;
- +    transcode_behavior_t on_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- +    transcode_behavior_t on_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- +
- +    switch (argc) {
- +    case 0:
- +        // No encoding given: target default_internal and replace instead
- +        // of raising.
- +        from = receiver->encoding;
- +        to = default_internal;
- +        on_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- +        on_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- +        break;
- +    case 1:
- +        from = receiver->encoding;
- +        to = rb_to_encoding(argv[0]);
- +        break;
- +    case 2:
- +        // Destination is resolved before source, as the arguments are given.
- +        to = rb_to_encoding(argv[0]);
- +        from = rb_to_encoding(argv[1]);
- +        break;
- +    default:
- +        rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
- +        break;
- +    }
- +
- +    if (!NIL_P(options)) {
- +        parse_conversion_options(options, &on_invalid, &on_undefined, &replacement, to);
- +        // A replacement string came back from the options, invalid-sequence
- +        // handling is not "replace", and undefined-conversion handling is
- +        // still "raise": switch undefined-conversion handling to "replace".
- +        if (replacement != NULL
- +                && on_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING
- +                && on_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION) {
- +            on_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- +        }
- +    }
- +
- +    // Fall back to the destination encoding's default replacement string
- +    // when replacing is requested but no string was supplied.
- +    if (replacement == NULL) {
- +        if (on_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING
- +                || on_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) {
- +            replacement = replacement_string_for_encoding(to);
- +        }
- +    }
- +
- +    return (VALUE)str_transcode(receiver, from, to,
- +            on_invalid, on_undefined, replacement);
- +}
- +
- +/*
- + * call-seq:
- + * str.encode!(encoding [, options] ) => str
- + * str.encode!(dst_encoding, src_encoding [, options] ) => str
- + *
- + * The first form transcodes the contents of <i>str</i> from
- + * str.encoding to +encoding+.
- + * The second form transcodes the contents of <i>str</i> from
- + * src_encoding to dst_encoding.
- + * The options Hash gives details for conversion. See String#encode
- + * for details.
- + * Returns the string even if no changes were made.
- + */
- +static VALUE
- +rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
- +{
- +    // Run the modification check before any transcoding work is done.
- +    rstr_modify(str);
- +
- +    // Transcode into a separate string, then overwrite the receiver's
- +    // contents with the result.
- +    VALUE transcoded = rstr_encode(str, sel, argc, argv);
- +    str_replace_with_string(RSTR(str), RSTR(transcoded));
- +
- +    // The receiver itself is returned, not the temporary.
- +    return str;
- +}
- +
- +/*
- + * Sets up everything this file exposes to Ruby: the three Encoding error
- + * classes, String#encode / String#encode!, the Encoding::Converter class
- + * with its methods and constants, and the symbols used as option keys and
- + * values by the conversion options.
- + */
- +void
- +Init_Transcode(void)
- +{
- + // Error classes, all defined under Encoding and derived from rb_eEncodingError.
- + rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
- + rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
- + rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
- +
- + // Arity -1: both functions take (argc, argv).
- + rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
- + rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
- +
- + rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
- + // NOTE(review): the next three are registered on *(VALUE *)rb_cEncodingConverter
- + // rather than on the class itself -- presumably its metaclass, which would
- + // make them class-level methods; confirm.
- + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
- + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
- + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
- +
- + // Methods registered directly on Encoding::Converter.
- + rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
- + rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
- + rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
- + rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
- + // The next four function names are #defined to rb_f_notimplement earlier
- + // in this file.
- + rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
- + rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
- + rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
- + rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
- +
- + // Symbols for the conversion option keys (:invalid, :undef, :replace, :xml)
- + // and the :xml values (:attr, :text).
- + sym_invalid = ID2SYM(rb_intern("invalid"));
- + sym_undef = ID2SYM(rb_intern("undef"));
- + sym_replace = ID2SYM(rb_intern("replace"));
- + sym_attr = ID2SYM(rb_intern("attr"));
- + sym_text = ID2SYM(rb_intern("text"));
- + sym_xml = ID2SYM(rb_intern("xml"));
- +
- + // If only these mapped to the internal enums...
- + rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
- + rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
- + rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
- + rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
- + rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
- + rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
- + rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
- + rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
- + rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
- + rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
- + rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
- + rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
- + rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
- +
- + // Compiled out: accessor methods on the two error classes and the
- + // Init_newline() call are disabled by the #if 0 below.
- +#if 0
- + rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
- + rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
- + rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
- + rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
- + rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
- +
- + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
- + rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
- +
- + Init_newline();
- +#endif
- +}
Add Comment
Please, Sign In to add comment