aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Armin Rigo <arigo@tunes.org>, 2011-08-01 16:18:56 +0200
committer: Armin Rigo <arigo@tunes.org>, 2011-08-01 16:18:56 +0200
commit: 33e3ca835f82f07dc427efd75bc9bc39e83919a8 (patch)
tree: 1f60fe56cbdb96f5329a6e7cca6e4ee20b301840
parent: Add an explicit flag 'add_memory_pressure=True' to the (diff)
parent: Add missing 'usemodules'. (diff)
download: pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.tar.gz
pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.tar.bz2
pypy-33e3ca835f82f07dc427efd75bc9bc39e83919a8.zip
merge heads
-rw-r--r-- lib-python/conftest.py | 26
-rw-r--r-- pypy/module/_multibytecodec/__init__.py | 9
-rw-r--r-- pypy/module/_multibytecodec/app_multibytecodec.py | 77
-rw-r--r-- pypy/module/_multibytecodec/c_codecs.py | 101
-rw-r--r-- pypy/module/_multibytecodec/interp_incremental.py | 141
-rw-r--r-- pypy/module/_multibytecodec/interp_multibytecodec.py | 49
-rw-r--r-- pypy/module/_multibytecodec/test/test_app_incremental.py | 138
-rw-r--r-- pypy/module/_multibytecodec/test/test_app_stream.py | 71
-rw-r--r-- pypy/module/_multibytecodec/test/test_c_codecs.py | 47
-rw-r--r-- pypy/translator/c/src/cjkcodecs/multibytecodec.c | 84
-rw-r--r-- pypy/translator/c/src/cjkcodecs/multibytecodec.h | 11
11 files changed, 608 insertions, 146 deletions
diff --git a/lib-python/conftest.py b/lib-python/conftest.py
index 09d107d622..84b15d6135 100644
--- a/lib-python/conftest.py
+++ b/lib-python/conftest.py
@@ -154,18 +154,18 @@ testmap = [
RegrTest('test_cmd.py'),
RegrTest('test_cmd_line_script.py'),
RegrTest('test_codeccallbacks.py', core=True),
- RegrTest('test_codecencodings_cn.py'),
- RegrTest('test_codecencodings_hk.py'),
- RegrTest('test_codecencodings_jp.py'),
- RegrTest('test_codecencodings_kr.py'),
- RegrTest('test_codecencodings_tw.py'),
-
- RegrTest('test_codecmaps_cn.py'),
- RegrTest('test_codecmaps_hk.py'),
- RegrTest('test_codecmaps_jp.py'),
- RegrTest('test_codecmaps_kr.py'),
- RegrTest('test_codecmaps_tw.py'),
- RegrTest('test_codecs.py', core=True),
+ RegrTest('test_codecencodings_cn.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecencodings_hk.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecencodings_jp.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecencodings_kr.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecencodings_tw.py', usemodules='_multibytecodec'),
+
+ RegrTest('test_codecmaps_cn.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecmaps_hk.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecmaps_jp.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecmaps_kr.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecmaps_tw.py', usemodules='_multibytecodec'),
+ RegrTest('test_codecs.py', core=True, usemodules='_multibytecodec'),
RegrTest('test_codeop.py', core=True),
RegrTest('test_coercion.py', core=True),
RegrTest('test_collections.py'),
@@ -314,7 +314,7 @@ testmap = [
RegrTest('test_mmap.py'),
RegrTest('test_module.py', core=True),
RegrTest('test_modulefinder.py'),
- RegrTest('test_multibytecodec.py'),
+ RegrTest('test_multibytecodec.py', usemodules='_multibytecodec'),
RegrTest('test_multibytecodec_support.py', skip="not a test"),
RegrTest('test_multifile.py'),
RegrTest('test_multiprocessing.py', skip='FIXME leaves subprocesses'),
diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py
index acf5dde8d4..59687a991c 100644
--- a/pypy/module/_multibytecodec/__init__.py
+++ b/pypy/module/_multibytecodec/__init__.py
@@ -7,13 +7,14 @@ class Module(MixedModule):
# for compatibility this name is obscured, and should be called
# via the _codecs_*.py modules written in lib_pypy.
'__getcodec': 'interp_multibytecodec.getcodec',
+
+ 'MultibyteIncrementalDecoder':
+ 'interp_incremental.MultibyteIncrementalDecoder',
+ 'MultibyteIncrementalEncoder':
+ 'interp_incremental.MultibyteIncrementalEncoder',
}
appleveldefs = {
- 'MultibyteIncrementalEncoder':
- 'app_multibytecodec.MultibyteIncrementalEncoder',
- 'MultibyteIncrementalDecoder':
- 'app_multibytecodec.MultibyteIncrementalDecoder',
'MultibyteStreamReader':
'app_multibytecodec.MultibyteStreamReader',
'MultibyteStreamWriter':
diff --git a/pypy/module/_multibytecodec/app_multibytecodec.py b/pypy/module/_multibytecodec/app_multibytecodec.py
index 1128139ad7..b0cd4310d5 100644
--- a/pypy/module/_multibytecodec/app_multibytecodec.py
+++ b/pypy/module/_multibytecodec/app_multibytecodec.py
@@ -1,34 +1,49 @@
# NOT_RPYTHON
#
-# These classes are not supported so far.
-#
-# My theory is that they are not widely used on CPython either, because
-# I found two bugs just by looking at their .c source: they always call
-# encreset() after a piece of data, even though I think it's wrong ---
-# it should be called only once at the end; and mbiencoder_reset() calls
-# decreset() instead of encreset().
-#
+# The interface here may be a little bit on the lightweight side.
+
+from _multibytecodec import MultibyteIncrementalDecoder
+from _multibytecodec import MultibyteIncrementalEncoder
+
+
+class MultibyteStreamReader(MultibyteIncrementalDecoder):
+ # Stream reader built on the incremental decoder: data obtained from the
+ # wrapped stream is passed through MultibyteIncrementalDecoder.decode().
+ def __new__(cls, stream, errors=None):
+ # 'errors' is forwarded to the decoder's __new__; 'stream' is kept on
+ # the instance and later used for its read()/readline() methods.
+ self = MultibyteIncrementalDecoder.__new__(cls, errors)
+ self.stream = stream
+ return self
+
+ def __read(self, read, size):
+ # Common helper for read(), readline() and readlines().
+ # 'read' is the bound stream method to call. With size None it is
+ # called without arguments and the data is decoded as final input.
+ # Otherwise read(size) is called and empty data marks the end of input.
+ while True:
+ if size is None:
+ data = read()
+ final = True
+ else:
+ data = read(size)
+ final = not data
+ output = MultibyteIncrementalDecoder.decode(self, data, final)
+ # Stop as soon as something was decoded or the input is exhausted;
+ # otherwise the data seen so far produced no output yet.
+ if output or final:
+ return output
+ size = 1 # read 1 more byte and retry
+
+ def read(self, size=None):
+ # Decode data obtained from stream.read().
+ return self.__read(self.stream.read, size)
+
+ def readline(self, size=None):
+ # Decode data obtained from stream.readline().
+ return self.__read(self.stream.readline, size)
+
+ def readlines(self, sizehint=None):
+ # Decode one __read() worth of stream.read() data and split it into
+ # lines, keeping the line endings (splitlines(True)).
+ return self.__read(self.stream.read, sizehint).splitlines(True)
+
+
+class MultibyteStreamWriter(MultibyteIncrementalEncoder):
+ def __new__(cls, stream, errors=None):
+ self = MultibyteIncrementalEncoder.__new__(cls, errors)
+ self.stream = stream
+ return self
+
+ def write(self, data):
+ self.stream.write(MultibyteIncrementalEncoder.encode(self, data, True))
-class MultibyteIncrementalEncoder(object):
- def __init__(self, *args, **kwds):
- raise LookupError(
- "MultibyteIncrementalEncoder not implemented; "
- "see pypy/module/_multibytecodec/app_multibytecodec.py")
-
-class MultibyteIncrementalDecoder(object):
- def __init__(self, *args, **kwds):
- raise LookupError(
- "MultibyteIncrementalDecoder not implemented; "
- "see pypy/module/_multibytecodec/app_multibytecodec.py")
-
-class MultibyteStreamReader(object):
- def __init__(self, *args, **kwds):
- raise LookupError(
- "MultibyteStreamReader not implemented; "
- "see pypy/module/_multibytecodec/app_multibytecodec.py")
-
-class MultibyteStreamWriter(object):
- def __init__(self, *args, **kwds):
- raise LookupError(
- "MultibyteStreamWriter not implemented; "
- "see pypy/module/_multibytecodec/app_multibytecodec.py")
+ def writelines(self, lines):
+ for data in lines:
+ self.write(data)
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
index 6756d97dc7..d0ab9a532b 100644
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -52,11 +52,13 @@ eci = ExternalCompilationInfo(
includes = ['src/cjkcodecs/multibytecodec.h'],
include_dirs = [str(srcdir)],
export_symbols = [
+ "pypy_cjk_dec_new",
"pypy_cjk_dec_init", "pypy_cjk_dec_free", "pypy_cjk_dec_chunk",
"pypy_cjk_dec_outbuf", "pypy_cjk_dec_outlen",
"pypy_cjk_dec_inbuf_remaining", "pypy_cjk_dec_inbuf_consumed",
"pypy_cjk_dec_replace_on_error",
+ "pypy_cjk_enc_new",
"pypy_cjk_enc_init", "pypy_cjk_enc_free", "pypy_cjk_enc_chunk",
"pypy_cjk_enc_reset", "pypy_cjk_enc_outbuf", "pypy_cjk_enc_outlen",
"pypy_cjk_enc_inbuf_remaining", "pypy_cjk_enc_inbuf_consumed",
@@ -92,9 +94,11 @@ def getcodec(name):
# Decoding
DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
+pypy_cjk_dec_new = llexternal('pypy_cjk_dec_new',
+ [MULTIBYTECODEC_P], DECODEBUF_P)
pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
- [MULTIBYTECODEC_P, rffi.CCHARP, rffi.SSIZE_T],
- DECODEBUF_P)
+ [DECODEBUF_P, rffi.CCHARP, rffi.SSIZE_T],
+ rffi.SSIZE_T)
pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
lltype.Void)
pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
@@ -113,25 +117,30 @@ pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error',
rffi.SSIZE_T)
def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None):
+ decodebuf = pypy_cjk_dec_new(codec)
+ if not decodebuf:
+ raise MemoryError
+ try:
+ return decodeex(decodebuf, stringdata, errors, errorcb, namecb)
+ finally:
+ pypy_cjk_dec_free(decodebuf)
+
+def decodeex(decodebuf, stringdata, errors="strict", errorcb=None, namecb=None,
+ ignore_error=0):
inleft = len(stringdata)
inbuf = rffi.get_nonmovingbuffer(stringdata)
try:
- decodebuf = pypy_cjk_dec_init(codec, inbuf, inleft)
- if not decodebuf:
+ if pypy_cjk_dec_init(decodebuf, inbuf, inleft) < 0:
raise MemoryError
- try:
- while True:
- r = pypy_cjk_dec_chunk(decodebuf)
- if r == 0:
- break
- multibytecodec_decerror(decodebuf, r, errors,
- errorcb, namecb, stringdata)
- src = pypy_cjk_dec_outbuf(decodebuf)
- length = pypy_cjk_dec_outlen(decodebuf)
- return rffi.wcharpsize2unicode(src, length)
- #
- finally:
- pypy_cjk_dec_free(decodebuf)
+ while True:
+ r = pypy_cjk_dec_chunk(decodebuf)
+ if r == 0 or r == ignore_error:
+ break
+ multibytecodec_decerror(decodebuf, r, errors,
+ errorcb, namecb, stringdata)
+ src = pypy_cjk_dec_outbuf(decodebuf)
+ length = pypy_cjk_dec_outlen(decodebuf)
+ return rffi.wcharpsize2unicode(src, length)
#
finally:
rffi.free_nonmovingbuffer(stringdata, inbuf)
@@ -174,9 +183,11 @@ def multibytecodec_decerror(decodebuf, e, errors,
# ____________________________________________________________
# Encoding
ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
+ [MULTIBYTECODEC_P], ENCODEBUF_P)
pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
- [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
- ENCODEBUF_P)
+ [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
+ rffi.SSIZE_T)
pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
lltype.Void)
pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
@@ -195,39 +206,46 @@ pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error',
[ENCODEBUF_P, rffi.CCHARP,
rffi.SSIZE_T, rffi.SSIZE_T],
rffi.SSIZE_T)
+pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
+ [ENCODEBUF_P], MULTIBYTECODEC_P)
def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+ encodebuf = pypy_cjk_enc_new(codec)
+ if not encodebuf:
+ raise MemoryError
+ try:
+ return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+ finally:
+ pypy_cjk_enc_free(encodebuf)
+
+def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+ namecb=None, ignore_error=0):
inleft = len(unicodedata)
inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
try:
- encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
- if not encodebuf:
+ if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
raise MemoryError
- try:
- while True:
- r = pypy_cjk_enc_chunk(encodebuf)
- if r == 0:
- break
- multibytecodec_encerror(encodebuf, r, errors,
- codec, errorcb, namecb, unicodedata)
- while True:
- r = pypy_cjk_enc_reset(encodebuf)
- if r == 0:
- break
- multibytecodec_encerror(encodebuf, r, errors,
- codec, errorcb, namecb, unicodedata)
- src = pypy_cjk_enc_outbuf(encodebuf)
- length = pypy_cjk_enc_outlen(encodebuf)
- return rffi.charpsize2str(src, length)
- #
- finally:
- pypy_cjk_enc_free(encodebuf)
+ while True:
+ r = pypy_cjk_enc_chunk(encodebuf)
+ if r == 0 or r == ignore_error:
+ break
+ multibytecodec_encerror(encodebuf, r, errors,
+ errorcb, namecb, unicodedata)
+ while True:
+ r = pypy_cjk_enc_reset(encodebuf)
+ if r == 0:
+ break
+ multibytecodec_encerror(encodebuf, r, errors,
+ errorcb, namecb, unicodedata)
+ src = pypy_cjk_enc_outbuf(encodebuf)
+ length = pypy_cjk_enc_outlen(encodebuf)
+ return rffi.charpsize2str(src, length)
#
finally:
rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
def multibytecodec_encerror(encodebuf, e, errors,
- codec, errorcb, namecb, unicodedata):
+ errorcb, namecb, unicodedata):
if e > 0:
reason = "illegal multibyte sequence"
esize = e
@@ -248,6 +266,7 @@ def multibytecodec_encerror(encodebuf, e, errors,
elif errors == "ignore":
replace = ""
elif errors == "replace":
+ codec = pypy_cjk_enc_getcodec(encodebuf)
try:
replace = encode(codec, u"?")
except EncodeDecodeError:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
new file mode 100644
index 0000000000..d83e4bf660
--- /dev/null
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -0,0 +1,141 @@
+from pypy.rpython.lltypesystem import lltype
+from pypy.module._multibytecodec import c_codecs
+from pypy.module._multibytecodec.interp_multibytecodec import (
+ MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
+ wrap_unicodeencodeerror)
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.gateway import interp2app, unwrap_spec
+from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.module._codecs.interp_codecs import CodecState
+
+
+class MultibyteIncrementalBase(Wrappable):
+ # Shared part of the incremental decoder and encoder classes.
+ # Subclasses supply _initialize() (allocate the C-level buffer and clear
+ # the pending input) and _free() (release that buffer).
+
+ def __init__(self, space, errors):
+ # A missing error mode means 'strict'.
+ if errors is None:
+ errors = 'strict'
+ self.space = space
+ self.errors = errors
+ # The codec is looked up as the 'codec' attribute of the wrapped
+ # instance and interpreted as a MultibyteCodec via interp_w().
+ w_codec = space.getattr(space.wrap(self), space.wrap("codec"))
+ codec = space.interp_w(MultibyteCodec, w_codec)
+ self.codec = codec.codec
+ self.name = codec.name
+ self._initialize()
+
+ def __del__(self):
+ # Release the subclass's buffer when the object goes away.
+ self._free()
+
+ def reset_w(self):
+ # App-level reset(): drop the current buffer and pending input, then
+ # start again from a fresh state.
+ self._free()
+ self._initialize()
+
+ def fget_errors(self, space):
+ # Getter for the app-level 'errors' property.
+ return space.wrap(self.errors)
+
+ def fset_errors(self, space, w_errors):
+ # Setter for the app-level 'errors' property; the value is unwrapped
+ # with space.str_w().
+ self.errors = space.str_w(w_errors)
+
+
+class MultibyteIncrementalDecoder(MultibyteIncrementalBase):
+    """Incremental decoder keeping a C-level decode buffer between calls.
+
+    Input bytes that the codec could not consume yet are stored in
+    self.pending and prepended to the data of the next decode() call.
+    """
+
+    def _initialize(self):
+        # Allocate a fresh decode buffer for self.codec and forget any
+        # pending input.  pypy_cjk_dec_new() returns NULL when it fails;
+        # report that as MemoryError (as c_codecs.decode() does) instead
+        # of keeping a NULL pointer that would later be dereferenced in C.
+        self.pending = ""
+        self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec)
+        if not self.decodebuf:
+            raise MemoryError
+
+    def _free(self):
+        # Release the decode buffer, if any, and mark it as gone.
+        self.pending = None
+        if self.decodebuf:
+            c_codecs.pypy_cjk_dec_free(self.decodebuf)
+            self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO)
+
+    @unwrap_spec(object=str, final=bool)
+    def decode_w(self, object, final=False):
+        """Decode 'object' (a byte string) and return the wrapped unicode.
+
+        Unless 'final' is true, a truncated multibyte sequence at the end
+        of the input is not an error: the unconsumed bytes are kept for
+        the next call.  Raises the app-level UnicodeDecodeError or
+        RuntimeError built by wrap_unicodedecodeerror() and
+        wrap_runtimeerror().
+        """
+        space = self.space
+        state = space.fromcache(CodecState)
+        if not self.decodebuf:
+            # Only possible after a reset() whose allocation failed: try
+            # again (raises MemoryError if it still fails).
+            self._initialize()
+        if len(self.pending) > 0:
+            object = self.pending + object
+        try:
+            output = c_codecs.decodeex(self.decodebuf, object, self.errors,
+                                       state.decode_error_handler, self.name,
+                                       get_ignore_error(final))
+        except c_codecs.EncodeDecodeError, e:
+            raise wrap_unicodedecodeerror(space, e, object, self.name)
+        except RuntimeError:
+            raise wrap_runtimeerror(space)
+        # Everything after the consumed position is carried over.
+        pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
+        assert 0 <= pos <= len(object)
+        self.pending = object[pos:]
+        return space.wrap(output)
+
+
+# Implementation of MultibyteIncrementalDecoder.__new__: allocates an
+# instance for the requested (sub)type, runs the interp-level __init__ with
+# the given error mode, and returns the wrapped object.
+@unwrap_spec(errors="str_or_None")
+def mbidecoder_new(space, w_subtype, errors=None):
+ r = space.allocate_instance(MultibyteIncrementalDecoder, w_subtype)
+ r.__init__(space, errors)
+ return space.wrap(r)
+
+MultibyteIncrementalDecoder.typedef = TypeDef(
+ 'MultibyteIncrementalDecoder',
+ __module__ = '_multibytecodec',
+ __new__ = interp2app(mbidecoder_new),
+ decode = interp2app(MultibyteIncrementalDecoder.decode_w),
+ reset = interp2app(MultibyteIncrementalDecoder.reset_w),
+ errors = GetSetProperty(MultibyteIncrementalDecoder.fget_errors,
+ MultibyteIncrementalDecoder.fset_errors),
+ )
+
+
+class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
+    """Incremental encoder keeping a C-level encode buffer between calls.
+
+    Unicode input that the codec could not consume yet is stored in
+    self.pending and prepended to the data of the next encode() call.
+    """
+
+    def _initialize(self):
+        # Allocate a fresh encode buffer for self.codec and forget any
+        # pending input.  pypy_cjk_enc_new() returns NULL when it fails;
+        # report that as MemoryError (as c_codecs.encode() does) instead
+        # of keeping a NULL pointer that would later be dereferenced in C.
+        self.pending = u""
+        self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
+        if not self.encodebuf:
+            raise MemoryError
+
+    def _free(self):
+        # Release the encode buffer, if any, and mark it as gone.
+        self.pending = None
+        if self.encodebuf:
+            c_codecs.pypy_cjk_enc_free(self.encodebuf)
+            self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
+
+    @unwrap_spec(object=unicode, final=bool)
+    def encode_w(self, object, final=False):
+        """Encode 'object' (a unicode string) and return the wrapped bytes.
+
+        Unless 'final' is true, MBERR_TOOFEW reported by the codec is not
+        an error: the unconsumed characters are kept for the next call.
+        Raises the app-level UnicodeEncodeError or RuntimeError built by
+        wrap_unicodeencodeerror() and wrap_runtimeerror().
+        """
+        space = self.space
+        state = space.fromcache(CodecState)
+        if not self.encodebuf:
+            # Only possible after a reset() whose allocation failed: try
+            # again (raises MemoryError if it still fails).
+            self._initialize()
+        if len(self.pending) > 0:
+            object = self.pending + object
+        try:
+            output = c_codecs.encodeex(self.encodebuf, object, self.errors,
+                                       state.encode_error_handler, self.name,
+                                       get_ignore_error(final))
+        except c_codecs.EncodeDecodeError, e:
+            raise wrap_unicodeencodeerror(space, e, object, self.name)
+        except RuntimeError:
+            raise wrap_runtimeerror(space)
+        # Everything after the consumed position is carried over.
+        pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
+        assert 0 <= pos <= len(object)
+        self.pending = object[pos:]
+        return space.wrap(output)
+
+
+# Implementation of MultibyteIncrementalEncoder.__new__: allocates an
+# instance for the requested (sub)type, runs the interp-level __init__ with
+# the given error mode, and returns the wrapped object.
+@unwrap_spec(errors="str_or_None")
+def mbiencoder_new(space, w_subtype, errors=None):
+ r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype)
+ r.__init__(space, errors)
+ return space.wrap(r)
+
+MultibyteIncrementalEncoder.typedef = TypeDef(
+ 'MultibyteIncrementalEncoder',
+ __module__ = '_multibytecodec',
+ __new__ = interp2app(mbiencoder_new),
+ encode = interp2app(MultibyteIncrementalEncoder.encode_w),
+ reset = interp2app(MultibyteIncrementalEncoder.reset_w),
+ errors = GetSetProperty(MultibyteIncrementalEncoder.fget_errors,
+ MultibyteIncrementalEncoder.fset_errors),
+ )
+
+
+def get_ignore_error(final):
+    """Return the codec error code that encodeex()/decodeex() may ignore.
+
+    For a non-final call this is MBERR_TOOFEW, because the rest of the
+    sequence can still arrive with the next piece of input; for the
+    final call it is 0, i.e. no error is ignored.
+    """
+    if not final:
+        return c_codecs.MBERR_TOOFEW
+    return 0     # don't ignore any error
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
index 6ffb9a8fa6..46b540c139 100644
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -22,17 +22,9 @@ class MultibyteCodec(Wrappable):
output = c_codecs.decode(self.codec, input, errors,
state.decode_error_handler, self.name)
except c_codecs.EncodeDecodeError, e:
- raise OperationError(
- space.w_UnicodeDecodeError,
- space.newtuple([
- space.wrap(self.name),
- space.wrap(input),
- space.wrap(e.start),
- space.wrap(e.end),
- space.wrap(e.reason)]))
+ raise wrap_unicodedecodeerror(space, e, input, self.name)
except RuntimeError:
- raise OperationError(space.w_RuntimeError,
- space.wrap("internal codec error"))
+ raise wrap_runtimeerror(space)
return space.newtuple([space.wrap(output),
space.wrap(len(input))])
@@ -46,17 +38,9 @@ class MultibyteCodec(Wrappable):
output = c_codecs.encode(self.codec, input, errors,
state.encode_error_handler, self.name)
except c_codecs.EncodeDecodeError, e:
- raise OperationError(
- space.w_UnicodeEncodeError,
- space.newtuple([
- space.wrap(self.name),
- space.wrap(input),
- space.wrap(e.start),
- space.wrap(e.end),
- space.wrap(e.reason)]))
+ raise wrap_unicodeencodeerror(space, e, input, self.name)
except RuntimeError:
- raise OperationError(space.w_RuntimeError,
- space.wrap("internal codec error"))
+ raise wrap_runtimeerror(space)
return space.newtuple([space.wrap(output),
space.wrap(len(input))])
@@ -78,3 +62,28 @@ def getcodec(space, name):
raise OperationError(space.w_LookupError,
space.wrap("no such codec is supported."))
return space.wrap(MultibyteCodec(name, codec))
+
+
+# Build (and return, for the caller to raise) the app-level
+# UnicodeDecodeError for the interp-level EncodeDecodeError 'e'. Its
+# arguments are the codec name, the input and e.start, e.end, e.reason.
+def wrap_unicodedecodeerror(space, e, input, name):
+ return OperationError(
+ space.w_UnicodeDecodeError,
+ space.newtuple([
+ space.wrap(name),
+ space.wrap(input),
+ space.wrap(e.start),
+ space.wrap(e.end),
+ space.wrap(e.reason)]))
+
+def wrap_unicodeencodeerror(space, e, input, name):
+    """Build the app-level UnicodeEncodeError for the EncodeDecodeError 'e'.
+
+    The exception is returned, not raised: callers write
+    'raise wrap_unicodeencodeerror(...)', matching
+    wrap_unicodedecodeerror().  Its arguments are the codec name, the
+    input and e.start, e.end, e.reason.
+    """
+    return OperationError(
+        space.w_UnicodeEncodeError,
+        space.newtuple([
+            space.wrap(name),
+            space.wrap(input),
+            space.wrap(e.start),
+            space.wrap(e.end),
+            space.wrap(e.reason)]))
+
+def wrap_runtimeerror(space):
+    """Build the app-level RuntimeError used for internal codec failures.
+
+    The exception is returned, not raised: callers write
+    'raise wrap_runtimeerror(space)'.
+    """
+    return OperationError(space.w_RuntimeError,
+                          space.wrap("internal codec error"))
diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
new file mode 100644
index 0000000000..7fd96eccdd
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -0,0 +1,138 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestClasses:
+ def setup_class(cls):
+ cls.space = gettestobjspace(usemodules=['_multibytecodec'])
+ cls.w_IncrementalHzDecoder = cls.space.appexec([], """():
+ import _codecs_cn
+ from _multibytecodec import MultibyteIncrementalDecoder
+
+ class IncrementalHzDecoder(MultibyteIncrementalDecoder):
+ codec = _codecs_cn.getcodec('hz')
+
+ return IncrementalHzDecoder
+ """)
+ cls.w_IncrementalHzEncoder = cls.space.appexec([], """():
+ import _codecs_cn
+ from _multibytecodec import MultibyteIncrementalEncoder
+
+ class IncrementalHzEncoder(MultibyteIncrementalEncoder):
+ codec = _codecs_cn.getcodec('hz')
+
+ return IncrementalHzEncoder
+ """)
+
+ def test_decode_hz(self):
+ d = self.IncrementalHzDecoder()
+ r = d.decode("~{abcd~}")
+ assert r == u'\u5f95\u6c85'
+ r = d.decode("~{efgh~}")
+ assert r == u'\u5f50\u73b7'
+ for c, output in zip("!~{abcd~}xyz~{efgh",
+ [u'!', # !
+ u'', # ~
+ u'', # {
+ u'', # a
+ u'\u5f95', # b
+ u'', # c
+ u'\u6c85', # d
+ u'', # ~
+ u'', # }
+ u'x', # x
+ u'y', # y
+ u'z', # z
+ u'', # ~
+ u'', # {
+ u'', # e
+ u'\u5f50', # f
+ u'', # g
+ u'\u73b7', # h
+ ]):
+ r = d.decode(c)
+ assert r == output
+
+ def test_decode_hz_final(self):
+ d = self.IncrementalHzDecoder()
+ r = d.decode("~{", True)
+ assert r == u''
+ raises(UnicodeDecodeError, d.decode, "~", True)
+ raises(UnicodeDecodeError, d.decode, "~{a", True)
+
+ def test_decode_hz_reset(self):
+ d = self.IncrementalHzDecoder()
+ r = d.decode("ab")
+ assert r == u'ab'
+ r = d.decode("~{")
+ assert r == u''
+ r = d.decode("ab")
+ assert r == u'\u5f95'
+ r = d.decode("ab")
+ assert r == u'\u5f95'
+ d.reset()
+ r = d.decode("ab")
+ assert r == u'ab'
+
+ def test_decode_hz_error(self):
+ d = self.IncrementalHzDecoder()
+ raises(UnicodeDecodeError, d.decode, "~{abc", True)
+ d = self.IncrementalHzDecoder("ignore")
+ r = d.decode("~{abc", True)
+ assert r == u'\u5f95'
+ d = self.IncrementalHzDecoder()
+ d.errors = "replace"
+ r = d.decode("~{abc", True)
+ assert r == u'\u5f95\ufffd'
+
+ def test_decode_hz_buffer_grow(self):
+ d = self.IncrementalHzDecoder()
+ for i in range(13):
+ r = d.decode("a" * (2**i))
+ assert r == u"a" * (2**i)
+
+ def test_encode_hz(self):
+ e = self.IncrementalHzEncoder()
+ r = e.encode("abcd")
+ assert r == 'abcd'
+ r = e.encode(u"\u5f95\u6c85")
+ assert r == '~{abcd~}'
+ r = e.encode(u"\u5f50")
+ assert r == '~{ef~}'
+ r = e.encode(u"\u73b7")
+ assert r == '~{gh~}'
+
+ def test_encode_hz_final(self):
+ e = self.IncrementalHzEncoder()
+ r = e.encode(u"xyz\u5f95\u6c85", True)
+ assert r == 'xyz~{abcd~}'
+ # This is a bit hard to test, because the only way I can see that
+ # encoders can return MBERR_TOOFEW is with surrogates, which only
+ # occur with 2-byte unicode characters... We will just have to
+ # trust that the logic works, because it is exactly the same one
+ # as in the decode case :-/
+
+ def test_encode_hz_reset(self):
+ # Same issue as with test_encode_hz_final
+ e = self.IncrementalHzEncoder()
+ r = e.encode(u"xyz\u5f95\u6c85", True)
+ assert r == 'xyz~{abcd~}'
+ e.reset()
+ r = e.encode(u"xyz\u5f95\u6c85")
+ assert r == 'xyz~{abcd~}'
+
+ def test_encode_hz_error(self):
+ e = self.IncrementalHzEncoder()
+ raises(UnicodeEncodeError, e.encode, u"\u4321", True)
+ e = self.IncrementalHzEncoder("ignore")
+ r = e.encode(u"xy\u4321z", True)
+ assert r == 'xyz'
+ e = self.IncrementalHzEncoder()
+ e.errors = "replace"
+ r = e.encode(u"xy\u4321z", True)
+ assert r == 'xy?z'
+
+ def test_encode_hz_buffer_grow(self):
+ e = self.IncrementalHzEncoder()
+ for i in range(13):
+ r = e.encode(u"a" * (2**i))
+ assert r == "a" * (2**i)
diff --git a/pypy/module/_multibytecodec/test/test_app_stream.py b/pypy/module/_multibytecodec/test/test_app_stream.py
new file mode 100644
index 0000000000..253c6ce66f
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/test_app_stream.py
@@ -0,0 +1,71 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestStreams:
+ def setup_class(cls):
+ cls.space = gettestobjspace(usemodules=['_multibytecodec'])
+ cls.w_HzStreamReader = cls.space.appexec([], """():
+ import _codecs_cn
+ from _multibytecodec import MultibyteStreamReader
+
+ class HzStreamReader(MultibyteStreamReader):
+ codec = _codecs_cn.getcodec('hz')
+
+ return HzStreamReader
+ """)
+ cls.w_HzStreamWriter = cls.space.appexec([], """():
+ import _codecs_cn
+ from _multibytecodec import MultibyteStreamWriter
+
+ class HzStreamWriter(MultibyteStreamWriter):
+ codec = _codecs_cn.getcodec('hz')
+
+ return HzStreamWriter
+ """)
+
+ def test_reader(self):
+ class FakeFile:
+ def __init__(self, data):
+ self.data = data
+ self.pos = 0
+ def read(self, size):
+ res = self.data[self.pos : self.pos + size]
+ self.pos += size
+ return res
+ #
+ r = self.HzStreamReader(FakeFile("!~{abcd~}xyz~{efgh"))
+ for expected in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+ c = r.read(1)
+ assert c == expected
+ c = r.read(1)
+ assert c == ''
+
+ def test_reader_replace(self):
+ class FakeFile:
+ def __init__(self, data):
+ self.data = data
+ def read(self):
+ return self.data
+ #
+ r = self.HzStreamReader(FakeFile("!~{a"), "replace")
+ c = r.read()
+ assert c == u'!\ufffd'
+ #
+ r = self.HzStreamReader(FakeFile("!~{a"))
+ r.errors = "replace"
+ assert r.errors == "replace"
+ c = r.read()
+ assert c == u'!\ufffd'
+
+ def test_writer(self):
+ class FakeFile:
+ def __init__(self):
+ self.output = []
+ def write(self, data):
+ self.output.append(data)
+ #
+ w = self.HzStreamWriter(FakeFile())
+ for input in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+ w.write(input)
+ assert w.stream.output == ['!', '~{ab~}', '~{cd~}', 'x', 'y', 'z',
+ '~{ef~}', '~{gh~}']
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
index e71668faa3..359241a377 100644
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -2,6 +2,7 @@ import py
from pypy.module._multibytecodec.c_codecs import getcodec, codecs
from pypy.module._multibytecodec.c_codecs import decode, encode
from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
+from pypy.module._multibytecodec import c_codecs
def test_codecs_existence():
@@ -22,6 +23,52 @@ def test_decode_hz():
c = getcodec("hz")
u = decode(c, "~{abc}")
assert u == u'\u5f95\u6cef'
+ u = decode(c, "~{")
+ assert u == u''
+
+def test_decodeex_hz():
+ c = getcodec("hz")
+ decodebuf = c_codecs.pypy_cjk_dec_new(c)
+ u = c_codecs.decodeex(decodebuf, "~{abcd~}")
+ assert u == u'\u5f95\u6c85'
+ u = c_codecs.decodeex(decodebuf, "~{efgh~}")
+ assert u == u'\u5f50\u73b7'
+ u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
+ assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+ c_codecs.pypy_cjk_dec_free(decodebuf)
+
+def test_decodeex_hz_incomplete():
+ c = getcodec("hz")
+ decodebuf = c_codecs.pypy_cjk_dec_new(c)
+ buf = ''
+ for c, output in zip("!~{abcd~}xyz~{efgh",
+ [u'!', # !
+ u'', # ~
+ u'', # {
+ u'', # a
+ u'\u5f95', # b
+ u'', # c
+ u'\u6c85', # d
+ u'', # ~
+ u'', # }
+ u'x', # x
+ u'y', # y
+ u'z', # z
+ u'', # ~
+ u'', # {
+ u'', # e
+ u'\u5f50', # f
+ u'', # g
+ u'\u73b7', # h
+ ]):
+ buf += c
+ u = c_codecs.decodeex(decodebuf, buf,
+ ignore_error = c_codecs.MBERR_TOOFEW)
+ assert u == output
+ incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
+ buf = buf[incompletepos:]
+ assert buf == ''
+ c_codecs.pypy_cjk_dec_free(decodebuf)
def test_decode_hz_error():
# error
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
index 8625bc04f5..f4d1e416e3 100644
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -3,31 +3,38 @@
#include "src/cjkcodecs/multibytecodec.h"
-struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
- char *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec)
{
struct pypy_cjk_dec_s *d = malloc(sizeof(struct pypy_cjk_dec_s));
if (!d)
return NULL;
if (codec->decinit != NULL && codec->decinit(&d->state, codec->config) != 0)
- goto errorexit;
-
+ {
+ free(d);
+ return NULL;
+ }
d->codec = codec;
+ d->outbuf_start = NULL;
+ return d;
+}
+
+Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d,
+ char *inbuf, Py_ssize_t inlen)
+{
d->inbuf_start = inbuf;
d->inbuf = inbuf;
d->inbuf_end = inbuf + inlen;
- d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
- malloc(inlen * sizeof(Py_UNICODE)) :
- NULL);
- if (!d->outbuf_start)
- goto errorexit;
+ if (d->outbuf_start == NULL)
+ {
+ d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
+ malloc(inlen * sizeof(Py_UNICODE)) :
+ NULL);
+ if (d->outbuf_start == NULL)
+ return -1;
+ d->outbuf_end = d->outbuf_start + inlen;
+ }
d->outbuf = d->outbuf_start;
- d->outbuf_end = d->outbuf_start + inlen;
- return d;
-
- errorexit:
- free(d);
- return NULL;
+ return 0;
}
void pypy_cjk_dec_free(struct pypy_cjk_dec_s *d)
@@ -112,34 +119,40 @@ Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d,
/************************************************************/
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
- Py_UNICODE *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec)
{
- Py_ssize_t outlen;
struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s));
if (!d)
return NULL;
if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0)
- goto errorexit;
-
+ {
+ free(d);
+ return NULL;
+ }
d->codec = codec;
+ d->outbuf_start = NULL;
+ return d;
+}
+
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+ Py_UNICODE *inbuf, Py_ssize_t inlen)
+{
+ Py_ssize_t outlen;
d->inbuf_start = inbuf;
d->inbuf = inbuf;
d->inbuf_end = inbuf + inlen;
-
- if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
- goto errorexit;
- outlen = inlen * 2 + 16;
- d->outbuf_start = malloc(outlen);
- if (!d->outbuf_start)
- goto errorexit;
+ if (d->outbuf_start == NULL)
+ {
+ if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
+ return -1;
+ outlen = inlen * 2 + 16;
+ d->outbuf_start = malloc(outlen);
+ if (d->outbuf_start == NULL)
+ return -1;
+ d->outbuf_end = d->outbuf_start + outlen;
+ }
d->outbuf = d->outbuf_start;
- d->outbuf_end = d->outbuf_start + outlen;
- return d;
-
- errorexit:
- free(d);
- return NULL;
+ return 0;
}
void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
@@ -242,3 +255,8 @@ Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
d->inbuf = d->inbuf_start + in_offset;
return 0;
}
+
+/* Return the codec that was stored in the encode buffer 'd' by
+ pypy_cjk_enc_new(). 'd' must not be NULL. */
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d)
+{
+ return d->codec;
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
index 8f59d14c8d..6d79acda43 100644
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -94,8 +94,9 @@ struct pypy_cjk_dec_s {
Py_UNICODE *outbuf_start, *outbuf, *outbuf_end;
};
-struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
- char *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec);
+Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d,
+ char *inbuf, Py_ssize_t inlen);
void pypy_cjk_dec_free(struct pypy_cjk_dec_s *);
Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);
Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *);
@@ -112,8 +113,9 @@ struct pypy_cjk_enc_s {
unsigned char *outbuf_start, *outbuf, *outbuf_end;
};
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
- Py_UNICODE *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec);
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+ Py_UNICODE *inbuf, Py_ssize_t inlen);
void pypy_cjk_enc_free(struct pypy_cjk_enc_s *);
Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *);
Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *);
@@ -123,6 +125,7 @@ Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
char *, Py_ssize_t, Py_ssize_t);
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
/* list of codecs defined in the .c files */