diff options
author | Armin Rigo <arigo@tunes.org> | 2011-08-01 16:18:56 +0200 |
---|---|---|
committer | Armin Rigo <arigo@tunes.org> | 2011-08-01 16:18:56 +0200 |
commit | 33e3ca835f82f07dc427efd75bc9bc39e83919a8 (patch) | |
tree | 1f60fe56cbdb96f5329a6e7cca6e4ee20b301840 | |
parent | Add an explicit flag 'add_memory_pressure=True' to the (diff) | |
parent | Add missing 'usemodules'. (diff) | |
download | pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.tar.gz pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.tar.bz2 pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.zip |
merge heads
-rw-r--r-- | lib-python/conftest.py | 26 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/__init__.py | 9 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/app_multibytecodec.py | 77 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/c_codecs.py | 101 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/interp_incremental.py | 141 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/interp_multibytecodec.py | 49 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/test/test_app_incremental.py | 138 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/test/test_app_stream.py | 71 | ||||
-rw-r--r-- | pypy/module/_multibytecodec/test/test_c_codecs.py | 47 | ||||
-rw-r--r-- | pypy/translator/c/src/cjkcodecs/multibytecodec.c | 84 | ||||
-rw-r--r-- | pypy/translator/c/src/cjkcodecs/multibytecodec.h | 11 |
11 files changed, 608 insertions, 146 deletions
diff --git a/lib-python/conftest.py b/lib-python/conftest.py index 09d107d622..84b15d6135 100644 --- a/lib-python/conftest.py +++ b/lib-python/conftest.py @@ -154,18 +154,18 @@ testmap = [ RegrTest('test_cmd.py'), RegrTest('test_cmd_line_script.py'), RegrTest('test_codeccallbacks.py', core=True), - RegrTest('test_codecencodings_cn.py'), - RegrTest('test_codecencodings_hk.py'), - RegrTest('test_codecencodings_jp.py'), - RegrTest('test_codecencodings_kr.py'), - RegrTest('test_codecencodings_tw.py'), - - RegrTest('test_codecmaps_cn.py'), - RegrTest('test_codecmaps_hk.py'), - RegrTest('test_codecmaps_jp.py'), - RegrTest('test_codecmaps_kr.py'), - RegrTest('test_codecmaps_tw.py'), - RegrTest('test_codecs.py', core=True), + RegrTest('test_codecencodings_cn.py', usemodules='_multibytecodec'), + RegrTest('test_codecencodings_hk.py', usemodules='_multibytecodec'), + RegrTest('test_codecencodings_jp.py', usemodules='_multibytecodec'), + RegrTest('test_codecencodings_kr.py', usemodules='_multibytecodec'), + RegrTest('test_codecencodings_tw.py', usemodules='_multibytecodec'), + + RegrTest('test_codecmaps_cn.py', usemodules='_multibytecodec'), + RegrTest('test_codecmaps_hk.py', usemodules='_multibytecodec'), + RegrTest('test_codecmaps_jp.py', usemodules='_multibytecodec'), + RegrTest('test_codecmaps_kr.py', usemodules='_multibytecodec'), + RegrTest('test_codecmaps_tw.py', usemodules='_multibytecodec'), + RegrTest('test_codecs.py', core=True, usemodules='_multibytecodec'), RegrTest('test_codeop.py', core=True), RegrTest('test_coercion.py', core=True), RegrTest('test_collections.py'), @@ -314,7 +314,7 @@ testmap = [ RegrTest('test_mmap.py'), RegrTest('test_module.py', core=True), RegrTest('test_modulefinder.py'), - RegrTest('test_multibytecodec.py'), + RegrTest('test_multibytecodec.py', usemodules='_multibytecodec'), RegrTest('test_multibytecodec_support.py', skip="not a test"), RegrTest('test_multifile.py'), RegrTest('test_multiprocessing.py', skip='FIXME leaves subprocesses'), diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py index acf5dde8d4..59687a991c 100644 --- a/pypy/module/_multibytecodec/__init__.py +++ b/pypy/module/_multibytecodec/__init__.py @@ -7,13 +7,14 @@ class Module(MixedModule): # for compatibility this name is obscured, and should be called # via the _codecs_*.py modules written in lib_pypy. '__getcodec': 'interp_multibytecodec.getcodec', + + 'MultibyteIncrementalDecoder': + 'interp_incremental.MultibyteIncrementalDecoder', + 'MultibyteIncrementalEncoder': + 'interp_incremental.MultibyteIncrementalEncoder', } appleveldefs = { - 'MultibyteIncrementalEncoder': - 'app_multibytecodec.MultibyteIncrementalEncoder', - 'MultibyteIncrementalDecoder': - 'app_multibytecodec.MultibyteIncrementalDecoder', 'MultibyteStreamReader': 'app_multibytecodec.MultibyteStreamReader', 'MultibyteStreamWriter': diff --git a/pypy/module/_multibytecodec/app_multibytecodec.py b/pypy/module/_multibytecodec/app_multibytecodec.py index 1128139ad7..b0cd4310d5 100644 --- a/pypy/module/_multibytecodec/app_multibytecodec.py +++ b/pypy/module/_multibytecodec/app_multibytecodec.py @@ -1,34 +1,49 @@ # NOT_RPYTHON # -# These classes are not supported so far. -# -# My theory is that they are not widely used on CPython either, because -# I found two bugs just by looking at their .c source: they always call -# encreset() after a piece of data, even though I think it's wrong --- -# it should be called only once at the end; and mbiencoder_reset() calls -# decreset() instead of encreset(). -# +# The interface here may be a little bit on the lightweight side. + +from _multibytecodec import MultibyteIncrementalDecoder +from _multibytecodec import MultibyteIncrementalEncoder + + +class MultibyteStreamReader(MultibyteIncrementalDecoder): + def __new__(cls, stream, errors=None): + self = MultibyteIncrementalDecoder.__new__(cls, errors) + self.stream = stream + return self + + def __read(self, read, size): + while True: + if size is None: + data = read() + final = True + else: + data = read(size) + final = not data + output = MultibyteIncrementalDecoder.decode(self, data, final) + if output or final: + return output + size = 1 # read 1 more byte and retry + + def read(self, size=None): + return self.__read(self.stream.read, size) + + def readline(self, size=None): + return self.__read(self.stream.readline, size) + + def readlines(self, sizehint=None): + return self.__read(self.stream.read, sizehint).splitlines(True) + + +class MultibyteStreamWriter(MultibyteIncrementalEncoder): + def __new__(cls, stream, errors=None): + self = MultibyteIncrementalEncoder.__new__(cls, errors) + self.stream = stream + return self + + def write(self, data): + self.stream.write(MultibyteIncrementalEncoder.encode(self, data, True)) -class MultibyteIncrementalEncoder(object): - def __init__(self, *args, **kwds): - raise LookupError( - "MultibyteIncrementalEncoder not implemented; " - "see pypy/module/_multibytecodec/app_multibytecodec.py") - -class MultibyteIncrementalDecoder(object): - def __init__(self, *args, **kwds): - raise LookupError( - "MultibyteIncrementalDecoder not implemented; " - "see pypy/module/_multibytecodec/app_multibytecodec.py") - -class MultibyteStreamReader(object): - def __init__(self, *args, **kwds): - raise LookupError( - "MultibyteStreamReader not implemented; " - "see pypy/module/_multibytecodec/app_multibytecodec.py") - -class MultibyteStreamWriter(object): - def __init__(self, *args, **kwds): - raise LookupError( - "MultibyteStreamWriter not implemented; " - "see pypy/module/_multibytecodec/app_multibytecodec.py") + def writelines(self, lines): + for data in lines: + self.write(data) diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py index 6756d97dc7..d0ab9a532b 100644 --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -52,11 +52,13 @@ eci = ExternalCompilationInfo( includes = ['src/cjkcodecs/multibytecodec.h'], include_dirs = [str(srcdir)], export_symbols = [ + "pypy_cjk_dec_new", "pypy_cjk_dec_init", "pypy_cjk_dec_free", "pypy_cjk_dec_chunk", "pypy_cjk_dec_outbuf", "pypy_cjk_dec_outlen", "pypy_cjk_dec_inbuf_remaining", "pypy_cjk_dec_inbuf_consumed", "pypy_cjk_dec_replace_on_error", + "pypy_cjk_enc_new", "pypy_cjk_enc_init", "pypy_cjk_enc_free", "pypy_cjk_enc_chunk", "pypy_cjk_enc_reset", "pypy_cjk_enc_outbuf", "pypy_cjk_enc_outlen", "pypy_cjk_enc_inbuf_remaining", "pypy_cjk_enc_inbuf_consumed", @@ -92,9 +94,11 @@ def getcodec(name): # Decoding DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci) +pypy_cjk_dec_new = llexternal('pypy_cjk_dec_new', + [MULTIBYTECODEC_P], DECODEBUF_P) pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init', - [MULTIBYTECODEC_P, rffi.CCHARP, rffi.SSIZE_T], - DECODEBUF_P) + [DECODEBUF_P, rffi.CCHARP, rffi.SSIZE_T], + rffi.SSIZE_T) pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P], lltype.Void) pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P], @@ -113,25 +117,30 @@ pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error', rffi.SSIZE_T) def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None): + decodebuf = pypy_cjk_dec_new(codec) + if not decodebuf: + raise MemoryError + try: + return decodeex(decodebuf, stringdata, errors, errorcb, namecb) + finally: + pypy_cjk_dec_free(decodebuf) + +def decodeex(decodebuf, stringdata, errors="strict", errorcb=None, namecb=None, + ignore_error=0): inleft = len(stringdata) inbuf = rffi.get_nonmovingbuffer(stringdata) try: - decodebuf = pypy_cjk_dec_init(codec, inbuf, inleft) - if not decodebuf: + if pypy_cjk_dec_init(decodebuf, inbuf, inleft) < 0: raise MemoryError - try: - while True: - r = pypy_cjk_dec_chunk(decodebuf) - if r == 0: - break - multibytecodec_decerror(decodebuf, r, errors, - errorcb, namecb, stringdata) - src = pypy_cjk_dec_outbuf(decodebuf) - length = pypy_cjk_dec_outlen(decodebuf) - return rffi.wcharpsize2unicode(src, length) - # - finally: - pypy_cjk_dec_free(decodebuf) + while True: + r = pypy_cjk_dec_chunk(decodebuf) + if r == 0 or r == ignore_error: + break + multibytecodec_decerror(decodebuf, r, errors, + errorcb, namecb, stringdata) + src = pypy_cjk_dec_outbuf(decodebuf) + length = pypy_cjk_dec_outlen(decodebuf) + return rffi.wcharpsize2unicode(src, length) # finally: rffi.free_nonmovingbuffer(stringdata, inbuf) @@ -174,9 +183,11 @@ def multibytecodec_decerror(decodebuf, e, errors, # ____________________________________________________________ # Encoding ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci) +pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new', + [MULTIBYTECODEC_P], ENCODEBUF_P) pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init', - [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T], - ENCODEBUF_P) + [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T], + rffi.SSIZE_T) pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P], lltype.Void) pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P], @@ -195,39 +206,46 @@ pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error', [ENCODEBUF_P, rffi.CCHARP, rffi.SSIZE_T, rffi.SSIZE_T], rffi.SSIZE_T) +pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec', + [ENCODEBUF_P], MULTIBYTECODEC_P) def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None): + encodebuf = pypy_cjk_enc_new(codec) + if not encodebuf: + raise MemoryError + try: + return encodeex(encodebuf, unicodedata, errors, errorcb, namecb) + finally: + pypy_cjk_enc_free(encodebuf) + +def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None, + namecb=None, ignore_error=0): inleft = len(unicodedata) inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata) try: - encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft) - if not encodebuf: + if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0: raise MemoryError - try: - while True: - r = pypy_cjk_enc_chunk(encodebuf) - if r == 0: - break - multibytecodec_encerror(encodebuf, r, errors, - codec, errorcb, namecb, unicodedata) - while True: - r = pypy_cjk_enc_reset(encodebuf) - if r == 0: - break - multibytecodec_encerror(encodebuf, r, errors, - codec, errorcb, namecb, unicodedata) - src = pypy_cjk_enc_outbuf(encodebuf) - length = pypy_cjk_enc_outlen(encodebuf) - return rffi.charpsize2str(src, length) - # - finally: - pypy_cjk_enc_free(encodebuf) + while True: + r = pypy_cjk_enc_chunk(encodebuf) + if r == 0 or r == ignore_error: + break + multibytecodec_encerror(encodebuf, r, errors, + errorcb, namecb, unicodedata) + while True: + r = pypy_cjk_enc_reset(encodebuf) + if r == 0: + break + multibytecodec_encerror(encodebuf, r, errors, + errorcb, namecb, unicodedata) + src = pypy_cjk_enc_outbuf(encodebuf) + length = pypy_cjk_enc_outlen(encodebuf) + return rffi.charpsize2str(src, length) # finally: rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf) def multibytecodec_encerror(encodebuf, e, errors, - codec, errorcb, namecb, unicodedata): + errorcb, namecb, unicodedata): if e > 0: reason = "illegal multibyte sequence" esize = e @@ -248,6 +266,7 @@ def multibytecodec_encerror(encodebuf, e, errors, elif errors == "ignore": replace = "" elif errors == "replace": + codec = pypy_cjk_enc_getcodec(encodebuf) try: replace = encode(codec, u"?") except EncodeDecodeError: diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py new file mode 100644 index 0000000000..d83e4bf660 --- /dev/null +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -0,0 +1,141 @@ +from pypy.rpython.lltypesystem import lltype +from pypy.module._multibytecodec import c_codecs +from pypy.module._multibytecodec.interp_multibytecodec import ( + MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror, + wrap_unicodeencodeerror) +from pypy.interpreter.baseobjspace import Wrappable +from pypy.interpreter.gateway import interp2app, unwrap_spec +from pypy.interpreter.typedef import TypeDef, GetSetProperty +from pypy.module._codecs.interp_codecs import CodecState + + +class MultibyteIncrementalBase(Wrappable): + + def __init__(self, space, errors): + if errors is None: + errors = 'strict' + self.space = space + self.errors = errors + w_codec = space.getattr(space.wrap(self), space.wrap("codec")) + codec = space.interp_w(MultibyteCodec, w_codec) + self.codec = codec.codec + self.name = codec.name + self._initialize() + + def __del__(self): + self._free() + + def reset_w(self): + self._free() + self._initialize() + + def fget_errors(self, space): + return space.wrap(self.errors) + + def fset_errors(self, space, w_errors): + self.errors = space.str_w(w_errors) + + +class MultibyteIncrementalDecoder(MultibyteIncrementalBase): + + def _initialize(self): + self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec) + self.pending = "" + + def _free(self): + self.pending = None + if self.decodebuf: + c_codecs.pypy_cjk_dec_free(self.decodebuf) + self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO) + + @unwrap_spec(object=str, final=bool) + def decode_w(self, object, final=False): + space = self.space + state = space.fromcache(CodecState) + if len(self.pending) > 0: + object = self.pending + object + try: + output = c_codecs.decodeex(self.decodebuf, object, self.errors, + state.decode_error_handler, self.name, + get_ignore_error(final)) + except c_codecs.EncodeDecodeError, e: + raise wrap_unicodedecodeerror(space, e, object, self.name) + except RuntimeError: + raise wrap_runtimeerror(space) + pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) + assert 0 <= pos <= len(object) + self.pending = object[pos:] + return space.wrap(output) + + +@unwrap_spec(errors="str_or_None") +def mbidecoder_new(space, w_subtype, errors=None): + r = space.allocate_instance(MultibyteIncrementalDecoder, w_subtype) + r.__init__(space, errors) + return space.wrap(r) + +MultibyteIncrementalDecoder.typedef = TypeDef( + 'MultibyteIncrementalDecoder', + __module__ = '_multibytecodec', + __new__ = interp2app(mbidecoder_new), + decode = interp2app(MultibyteIncrementalDecoder.decode_w), + reset = interp2app(MultibyteIncrementalDecoder.reset_w), + errors = GetSetProperty(MultibyteIncrementalDecoder.fget_errors, + MultibyteIncrementalDecoder.fset_errors), + ) + + +class MultibyteIncrementalEncoder(MultibyteIncrementalBase): + + def _initialize(self): + self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec) + self.pending = u"" + + def _free(self): + self.pending = None + if self.encodebuf: + c_codecs.pypy_cjk_enc_free(self.encodebuf) + self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO) + + @unwrap_spec(object=unicode, final=bool) + def encode_w(self, object, final=False): + space = self.space + state = space.fromcache(CodecState) + if len(self.pending) > 0: + object = self.pending + object + try: + output = c_codecs.encodeex(self.encodebuf, object, self.errors, + state.encode_error_handler, self.name, + get_ignore_error(final)) + except c_codecs.EncodeDecodeError, e: + raise wrap_unicodeencodeerror(space, e, object, self.name) + except RuntimeError: + raise wrap_runtimeerror(space) + pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf) + assert 0 <= pos <= len(object) + self.pending = object[pos:] + return space.wrap(output) + + +@unwrap_spec(errors="str_or_None") +def mbiencoder_new(space, w_subtype, errors=None): + r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype) + r.__init__(space, errors) + return space.wrap(r) + +MultibyteIncrementalEncoder.typedef = TypeDef( + 'MultibyteIncrementalEncoder', + __module__ = '_multibytecodec', + __new__ = interp2app(mbiencoder_new), + encode = interp2app(MultibyteIncrementalEncoder.encode_w), + reset = interp2app(MultibyteIncrementalEncoder.reset_w), + errors = GetSetProperty(MultibyteIncrementalEncoder.fget_errors, + MultibyteIncrementalEncoder.fset_errors), + ) + + +def get_ignore_error(final): + if final: + return 0 # don't ignore any error + else: + return c_codecs.MBERR_TOOFEW diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py index 6ffb9a8fa6..46b540c139 100644 --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -22,17 +22,9 @@ class MultibyteCodec(Wrappable): output = c_codecs.decode(self.codec, input, errors, state.decode_error_handler, self.name) except c_codecs.EncodeDecodeError, e: - raise OperationError( - space.w_UnicodeDecodeError, - space.newtuple([ - space.wrap(self.name), - space.wrap(input), - space.wrap(e.start), - space.wrap(e.end), - space.wrap(e.reason)])) + raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: - raise OperationError(space.w_RuntimeError, - space.wrap("internal codec error")) + raise wrap_runtimeerror(space) return space.newtuple([space.wrap(output), space.wrap(len(input))]) @@ -46,17 +38,9 @@ class MultibyteCodec(Wrappable): output = c_codecs.encode(self.codec, input, errors, state.encode_error_handler, self.name) except c_codecs.EncodeDecodeError, e: - raise OperationError( - space.w_UnicodeEncodeError, - space.newtuple([ - space.wrap(self.name), - space.wrap(input), - space.wrap(e.start), - space.wrap(e.end), - space.wrap(e.reason)])) + raise wrap_unicodeencodeerror(space, e, input, self.name) except RuntimeError: - raise OperationError(space.w_RuntimeError, - space.wrap("internal codec error")) + raise wrap_runtimeerror(space) return space.newtuple([space.wrap(output), space.wrap(len(input))]) @@ -78,3 +62,28 @@ def getcodec(space, name): raise OperationError(space.w_LookupError, space.wrap("no such codec is supported.")) return space.wrap(MultibyteCodec(name, codec)) + + +def wrap_unicodedecodeerror(space, e, input, name): + return OperationError( + space.w_UnicodeDecodeError, + space.newtuple([ + space.wrap(name), + space.wrap(input), + space.wrap(e.start), + space.wrap(e.end), + space.wrap(e.reason)])) + +def wrap_unicodeencodeerror(space, e, input, name): + raise OperationError( + space.w_UnicodeEncodeError, + space.newtuple([ + space.wrap(name), + space.wrap(input), + space.wrap(e.start), + space.wrap(e.end), + space.wrap(e.reason)])) + +def wrap_runtimeerror(space): + raise OperationError(space.w_RuntimeError, + space.wrap("internal codec error")) diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py new file mode 100644 index 0000000000..7fd96eccdd --- /dev/null +++ b/pypy/module/_multibytecodec/test/test_app_incremental.py @@ -0,0 +1,138 @@ +from pypy.conftest import gettestobjspace + + +class AppTestClasses: + def setup_class(cls): + cls.space = gettestobjspace(usemodules=['_multibytecodec']) + cls.w_IncrementalHzDecoder = cls.space.appexec([], """(): + import _codecs_cn + from _multibytecodec import MultibyteIncrementalDecoder + + class IncrementalHzDecoder(MultibyteIncrementalDecoder): + codec = _codecs_cn.getcodec('hz') + + return IncrementalHzDecoder + """) + cls.w_IncrementalHzEncoder = cls.space.appexec([], """(): + import _codecs_cn + from _multibytecodec import MultibyteIncrementalEncoder + + class IncrementalHzEncoder(MultibyteIncrementalEncoder): + codec = _codecs_cn.getcodec('hz') + + return IncrementalHzEncoder + """) + + def test_decode_hz(self): + d = self.IncrementalHzDecoder() + r = d.decode("~{abcd~}") + assert r == u'\u5f95\u6c85' + r = d.decode("~{efgh~}") + assert r == u'\u5f50\u73b7' + for c, output in zip("!~{abcd~}xyz~{efgh", + [u'!', # ! + u'', # ~ + u'', # { + u'', # a + u'\u5f95', # b + u'', # c + u'\u6c85', # d + u'', # ~ + u'', # } + u'x', # x + u'y', # y + u'z', # z + u'', # ~ + u'', # { + u'', # e + u'\u5f50', # f + u'', # g + u'\u73b7', # h + ]): + r = d.decode(c) + assert r == output + + def test_decode_hz_final(self): + d = self.IncrementalHzDecoder() + r = d.decode("~{", True) + assert r == u'' + raises(UnicodeDecodeError, d.decode, "~", True) + raises(UnicodeDecodeError, d.decode, "~{a", True) + + def test_decode_hz_reset(self): + d = self.IncrementalHzDecoder() + r = d.decode("ab") + assert r == u'ab' + r = d.decode("~{") + assert r == u'' + r = d.decode("ab") + assert r == u'\u5f95' + r = d.decode("ab") + assert r == u'\u5f95' + d.reset() + r = d.decode("ab") + assert r == u'ab' + + def test_decode_hz_error(self): + d = self.IncrementalHzDecoder() + raises(UnicodeDecodeError, d.decode, "~{abc", True) + d = self.IncrementalHzDecoder("ignore") + r = d.decode("~{abc", True) + assert r == u'\u5f95' + d = self.IncrementalHzDecoder() + d.errors = "replace" + r = d.decode("~{abc", True) + assert r == u'\u5f95\ufffd' + + def test_decode_hz_buffer_grow(self): + d = self.IncrementalHzDecoder() + for i in range(13): + r = d.decode("a" * (2**i)) + assert r == u"a" * (2**i) + + def test_encode_hz(self): + e = self.IncrementalHzEncoder() + r = e.encode("abcd") + assert r == 'abcd' + r = e.encode(u"\u5f95\u6c85") + assert r == '~{abcd~}' + r = e.encode(u"\u5f50") + assert r == '~{ef~}' + r = e.encode(u"\u73b7") + assert r == '~{gh~}' + + def test_encode_hz_final(self): + e = self.IncrementalHzEncoder() + r = e.encode(u"xyz\u5f95\u6c85", True) + assert r == 'xyz~{abcd~}' + # This is a bit hard to test, because the only way I can see that + # encoders can return MBERR_TOOFEW is with surrogates, which only + # occur with 2-byte unicode characters... We will just have to + # trust that the logic works, because it is exactly the same one + # as in the decode case :-/ + + def test_encode_hz_reset(self): + # Same issue as with test_encode_hz_final + e = self.IncrementalHzEncoder() + r = e.encode(u"xyz\u5f95\u6c85", True) + assert r == 'xyz~{abcd~}' + e.reset() + r = e.encode(u"xyz\u5f95\u6c85") + assert r == 'xyz~{abcd~}' + + def test_encode_hz_error(self): + e = self.IncrementalHzEncoder() + raises(UnicodeEncodeError, e.encode, u"\u4321", True) + e = self.IncrementalHzEncoder("ignore") + r = e.encode(u"xy\u4321z", True) + assert r == 'xyz' + e = self.IncrementalHzEncoder() + e.errors = "replace" + r = e.encode(u"xy\u4321z", True) + assert r == 'xy?z' + + def test_encode_hz_buffer_grow(self): + e = self.IncrementalHzEncoder() + for i in range(13): + r = e.encode(u"a" * (2**i)) + assert r == "a" * (2**i) diff --git a/pypy/module/_multibytecodec/test/test_app_stream.py b/pypy/module/_multibytecodec/test/test_app_stream.py new file mode 100644 index 0000000000..253c6ce66f --- /dev/null +++ b/pypy/module/_multibytecodec/test/test_app_stream.py @@ -0,0 +1,71 @@ +from pypy.conftest import gettestobjspace + + +class AppTestStreams: + def setup_class(cls): + cls.space = gettestobjspace(usemodules=['_multibytecodec']) + cls.w_HzStreamReader = cls.space.appexec([], """(): + import _codecs_cn + from _multibytecodec import MultibyteStreamReader + + class HzStreamReader(MultibyteStreamReader): + codec = _codecs_cn.getcodec('hz') + + return HzStreamReader + """) + cls.w_HzStreamWriter = cls.space.appexec([], """(): + import _codecs_cn + from _multibytecodec import MultibyteStreamWriter + + class HzStreamWriter(MultibyteStreamWriter): + codec = _codecs_cn.getcodec('hz') + + return HzStreamWriter + """) + + def test_reader(self): + class FakeFile: + def __init__(self, data): + self.data = data + self.pos = 0 + def read(self, size): + res = self.data[self.pos : self.pos + size] + self.pos += size + return res + # + r = self.HzStreamReader(FakeFile("!~{abcd~}xyz~{efgh")) + for expected in u'!\u5f95\u6c85xyz\u5f50\u73b7': + c = r.read(1) + assert c == expected + c = r.read(1) + assert c == '' + + def test_reader_replace(self): + class FakeFile: + def __init__(self, data): + self.data = data + def read(self): + return self.data + # + r = self.HzStreamReader(FakeFile("!~{a"), "replace") + c = r.read() + assert c == u'!\ufffd' + # + r = self.HzStreamReader(FakeFile("!~{a")) + r.errors = "replace" + assert r.errors == "replace" + c = r.read() + assert c == u'!\ufffd' + + def test_writer(self): + class FakeFile: + def __init__(self): + self.output = [] + def write(self, data): + self.output.append(data) + # + w = self.HzStreamWriter(FakeFile()) + for input in u'!\u5f95\u6c85xyz\u5f50\u73b7': + w.write(input) + assert w.stream.output == ['!', '~{ab~}', '~{cd~}', 'x', 'y', 'z', + '~{ef~}', '~{gh~}'] diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py index e71668faa3..359241a377 100644 --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -2,6 +2,7 @@ import py from pypy.module._multibytecodec.c_codecs import getcodec, codecs from pypy.module._multibytecodec.c_codecs import decode, encode from pypy.module._multibytecodec.c_codecs import EncodeDecodeError +from pypy.module._multibytecodec import c_codecs def test_codecs_existence(): @@ -22,6 +23,52 @@ def test_decode_hz(): c = getcodec("hz") u = decode(c, "~{abc}") assert u == u'\u5f95\u6cef' + u = decode(c, "~{") + assert u == u'' + +def test_decodeex_hz(): + c = getcodec("hz") + decodebuf = c_codecs.pypy_cjk_dec_new(c) + u = c_codecs.decodeex(decodebuf, "~{abcd~}") + assert u == u'\u5f95\u6c85' + u = c_codecs.decodeex(decodebuf, "~{efgh~}") + assert u == u'\u5f50\u73b7' + u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh") + assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7' + c_codecs.pypy_cjk_dec_free(decodebuf) + +def test_decodeex_hz_incomplete(): + c = getcodec("hz") + decodebuf = c_codecs.pypy_cjk_dec_new(c) + buf = '' + for c, output in zip("!~{abcd~}xyz~{efgh", + [u'!', # ! + u'', # ~ + u'', # { + u'', # a + u'\u5f95', # b + u'', # c + u'\u6c85', # d + u'', # ~ + u'', # } + u'x', # x + u'y', # y + u'z', # z + u'', # ~ + u'', # { + u'', # e + u'\u5f50', # f + u'', # g + u'\u73b7', # h + ]): + buf += c + u = c_codecs.decodeex(decodebuf, buf, + ignore_error = c_codecs.MBERR_TOOFEW) + assert u == output + incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf) + buf = buf[incompletepos:] + assert buf == '' + c_codecs.pypy_cjk_dec_free(decodebuf) def test_decode_hz_error(): # error diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c index 8625bc04f5..f4d1e416e3 100644 --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c @@ -3,31 +3,38 @@ #include "src/cjkcodecs/multibytecodec.h" -struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec, - char *inbuf, Py_ssize_t inlen) +struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec) { struct pypy_cjk_dec_s *d = malloc(sizeof(struct pypy_cjk_dec_s)); if (!d) return NULL; if (codec->decinit != NULL && codec->decinit(&d->state, codec->config) != 0) - goto errorexit; - + { + free(d); + return NULL; + } d->codec = codec; + d->outbuf_start = NULL; + return d; +} + +Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d, + char *inbuf, Py_ssize_t inlen) +{ d->inbuf_start = inbuf; d->inbuf = inbuf; d->inbuf_end = inbuf + inlen; - d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ? - malloc(inlen * sizeof(Py_UNICODE)) : - NULL); - if (!d->outbuf_start) - goto errorexit; + if (d->outbuf_start == NULL) + { + d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ? + malloc(inlen * sizeof(Py_UNICODE)) : + NULL); + if (d->outbuf_start == NULL) + return -1; + d->outbuf_end = d->outbuf_start + inlen; + } d->outbuf = d->outbuf_start; - d->outbuf_end = d->outbuf_start + inlen; - return d; - - errorexit: - free(d); - return NULL; + return 0; } void pypy_cjk_dec_free(struct pypy_cjk_dec_s *d) @@ -112,34 +119,40 @@ Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d, /************************************************************/ -struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec, - Py_UNICODE *inbuf, Py_ssize_t inlen) +struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec) { - Py_ssize_t outlen; struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s)); if (!d) return NULL; if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0) - goto errorexit; - + { + free(d); + return NULL; + } d->codec = codec; + d->outbuf_start = NULL; + return d; +} + +Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d, + Py_UNICODE *inbuf, Py_ssize_t inlen) +{ + Py_ssize_t outlen; d->inbuf_start = inbuf; d->inbuf = inbuf; d->inbuf_end = inbuf + inlen; - - if (inlen > (PY_SSIZE_T_MAX - 16) / 2) - goto errorexit; - outlen = inlen * 2 + 16; - d->outbuf_start = malloc(outlen); - if (!d->outbuf_start) - goto errorexit; + if (d->outbuf_start == NULL) + { + if (inlen > (PY_SSIZE_T_MAX - 16) / 2) + return -1; + outlen = inlen * 2 + 16; + d->outbuf_start = malloc(outlen); + if (d->outbuf_start == NULL) + return -1; + d->outbuf_end = d->outbuf_start + outlen; + } d->outbuf = d->outbuf_start; - d->outbuf_end = d->outbuf_start + outlen; - return d; - - errorexit: - free(d); - return NULL; + return 0; } void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d) @@ -242,3 +255,8 @@ Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d, d->inbuf = d->inbuf_start + in_offset; return 0; } + +const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d) +{ + return d->codec; +} diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h index 8f59d14c8d..6d79acda43 100644 --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h @@ -94,8 +94,9 @@ struct pypy_cjk_dec_s { Py_UNICODE *outbuf_start, *outbuf, *outbuf_end; }; -struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec, - char *inbuf, Py_ssize_t inlen); +struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec); +Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d, + char *inbuf, Py_ssize_t inlen); void pypy_cjk_dec_free(struct pypy_cjk_dec_s *); Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *); Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *); @@ -112,8 +113,9 @@ struct pypy_cjk_enc_s { unsigned char *outbuf_start, *outbuf, *outbuf_end; }; -struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec, - Py_UNICODE *inbuf, Py_ssize_t inlen); +struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec); +Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d, + Py_UNICODE *inbuf, Py_ssize_t inlen); void pypy_cjk_enc_free(struct pypy_cjk_enc_s *); Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *); Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *); @@ -123,6 +125,7 @@ Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d); Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d); Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d, char *, Py_ssize_t, Py_ssize_t); +const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *); /* list of codecs defined in the .c files */ |