Discussion:
[VM] [XEMACS PATCH] Have coding systems do some bytecount->charcount work
Aidan Kehoe
2013-12-19 11:14:48 UTC
Permalink
With large non-conversion-unix files (in particular, VM buffers), 10% of the
time needed to read them in is spent in bytecount_to_charcount_func(),
working out the byte-character correspondence for the buffer code.

The coding systems in general know exactly where the character boundaries
are, though, and if they record it there’s no need for the buffer insertion
code to do that work. The below patch gives noticeably snappier performance
for me loading large files with reasonable amounts of non-ASCII characters.
It is very much not ready to commit; I am posting it to show the idea, and
because I don’t anticipate I’ll get around to finishing it this month.

diff -r 94a6b8fbd56e src/file-coding.c
--- a/src/file-coding.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/file-coding.c Thu Dec 19 10:47:01 2013 +0000
@@ -1990,6 +1990,14 @@
return Lstream_seekable_p (str->other_end);
}

+static Charcount
+coding_character_tell (Lstream *stream)
+{
+ struct coding_stream *str = CODING_STREAM_DATA (stream);
+
+ return XCODESYSMETH_OR_GIVEN (str->codesys, character_tell, (str), -1);
+}
+
static int
coding_flusher (Lstream *stream)
{
@@ -2823,7 +2831,31 @@

#### Shouldn't we _call_ it that, then? And while we're at it,
separate it into "to_internal" and "to_external"? */
-DEFINE_CODING_SYSTEM_TYPE (no_conversion);
+
+
+struct no_conversion_coding_system
+{
+};
+
+struct no_conversion_coding_stream
+{
+ Charcount characters_seen;
+};
+
+static const struct memory_description no_conversion_coding_system_description[] = {
+ { XD_END }
+};
+
+static const struct memory_description no_conversion_coding_stream_description_1 [] = {
+ { XD_INT, offsetof (struct no_conversion_coding_stream, characters_seen) },
+ { XD_END }
+};
+
+const struct sized_memory_description no_conversion_coding_stream_description = {
+ sizeof (struct no_conversion_coding_stream), no_conversion_coding_stream_description_1
+};
+
+DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion);

/* This is used when reading in "binary" files -- i.e. files that may
contain all 256 possible byte values and that are not to be
@@ -2846,12 +2878,14 @@
DECODE_ADD_BINARY_CHAR (c, dst);
}

+ CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen
+ += orign;
+
if (str->eof)
DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
}
else
{
-
while (n--)
{
c = *src++;
@@ -2893,6 +2927,13 @@
return orign;
}

+static Charcount
+no_conversion_character_tell (struct coding_stream *str)
+{
+#warning "examine the coding character mode too"n
+ return CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen;
+}
+
DEFINE_DETECTOR (no_conversion);
DEFINE_DETECTOR_CATEGORY (no_conversion, no_conversion);

@@ -4645,6 +4686,7 @@
LSTREAM_HAS_METHOD (coding, writer);
LSTREAM_HAS_METHOD (coding, rewinder);
LSTREAM_HAS_METHOD (coding, seekable_p);
+ LSTREAM_HAS_METHOD (coding, character_tell);
LSTREAM_HAS_METHOD (coding, marker);
LSTREAM_HAS_METHOD (coding, flusher);
LSTREAM_HAS_METHOD (coding, closer);
@@ -4686,9 +4728,10 @@
dump_add_opaque_int (&coding_detector_count);
dump_add_opaque_int (&coding_detector_category_count);

- INITIALIZE_CODING_SYSTEM_TYPE (no_conversion,
- "no-conversion-coding-system-p");
+ INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion,
+ "no-conversion-coding-system-p");
CODING_SYSTEM_HAS_METHOD (no_conversion, convert);
+ CODING_SYSTEM_HAS_METHOD (no_conversion, character_tell);

INITIALIZE_DETECTOR (no_conversion);
DETECTOR_HAS_METHOD (no_conversion, detect);
diff -r 94a6b8fbd56e src/file-coding.h
--- a/src/file-coding.h Tue Dec 17 20:49:52 2013 +0200
+++ b/src/file-coding.h Thu Dec 19 10:47:01 2013 +0000
@@ -353,6 +353,9 @@
a result of the stream being rewound. Optional. */
void (*rewind_coding_stream_method) (struct coding_stream *str);

+ /* Return the number of characters processed. Optional. */
+ Charcount (*character_tell_method) (struct coding_stream *str);
+
/* Finalize coding stream method: Clean up the type-specific data
attached to the coding stream (i.e. in struct TYPE_coding_stream).
Happens when the Lstream is deleted using Lstream_delete() or is
@@ -1109,4 +1112,3 @@
int given);

#endif /* INCLUDED_file_coding_h_ */
-
diff -r 94a6b8fbd56e src/fileio.c
--- a/src/fileio.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/fileio.c Thu Dec 19 10:47:01 2013 +0000
@@ -3196,7 +3196,8 @@
while (1)
{
Bytecount this_len;
- Charcount cc_inserted;
+ Charcount last_tell
+ = Lstream_character_tell (XLSTREAM (stream)), cc_inserted;

QUIT;
this_len = Lstream_read (XLSTREAM (stream), read_buf,
@@ -3209,10 +3210,13 @@
break;
}

- cc_inserted = buffer_insert_raw_string_1 (buf, cur_point, read_buf,
- this_len,
- !NILP (visit)
- ? INSDEL_NO_LOCKING : 0);
+ cc_inserted
+ = buffer_insert_string_1 (buf, cur_point, read_buf, Qnil,
+ 0, this_len, last_tell > 0
+ ? Lstream_character_tell (XLSTREAM
+ (stream))
+ - last_tell : -1,
+ !NILP (visit) ? INSDEL_NO_LOCKING : 0);
inserted += cc_inserted;
cur_point += cc_inserted;
}
diff -r 94a6b8fbd56e src/insdel.c
--- a/src/insdel.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/insdel.c Thu Dec 19 10:47:01 2013 +0000
@@ -1061,13 +1061,12 @@
buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc, Lisp_Object reloc,
Bytecount offset, Bytecount length,
- int flags)
+ Charcount cclen, int flags)
{
/* This function can GC */
struct gcpro gcpro1;
Bytebpos bytepos;
Bytecount length_in_buffer;
- Charcount cclen;
int move_point = 0;
struct buffer *mbuf;
Lisp_Object bufcons;
@@ -1118,14 +1117,30 @@

bytepos = charbpos_to_bytebpos (buf, pos);

- /* string may have been relocated up to this point */
- if (STRINGP (reloc))
+ if (cclen < 0)
{
- cclen = string_offset_byte_to_char_len (reloc, offset, length);
- nonreloc = XSTRING_DATA (reloc);
+ /* string may have been relocated up to this point */
+ if (STRINGP (reloc))
+ {
+ cclen = string_offset_byte_to_char_len (reloc, offset, length);
+ nonreloc = XSTRING_DATA (reloc);
+ }
+ else
+ cclen = bytecount_to_charcount (nonreloc + offset, length);
}
+#ifdef ERROR_CHECK_TEXT
else
- cclen = bytecount_to_charcount (nonreloc + offset, length);
+ {
+ text_checking_assert (cclen
+ == (STRINGP (reloc) ?
+ string_offset_byte_to_char_len (reloc,
+ offset, length)
+ : bytecount_to_charcount (nonreloc + offset,
+ length)));
+
+ }
+#endif
+
/* &&#### Here we check if the text can't fit into the format of the buffer,
and if so convert it to another format (either default or 32-bit-fixed,
according to some flag; if no flag, use default). */
@@ -1286,7 +1301,7 @@
{
/* This function can GC */
return buffer_insert_string_1 (buf, pos, nonreloc, Qnil, 0, length,
- flags);
+ -1, flags);
}

Charcount
@@ -1295,8 +1310,7 @@
{
/* This function can GC */
return buffer_insert_string_1 (buf, pos, 0, str, 0,
- XSTRING_LENGTH (str),
- flags);
+ XSTRING_LENGTH (str), -1, flags);
}

/* Insert the null-terminated string S (in external format). */
@@ -1309,7 +1323,7 @@
const CIbyte *translated = GETTEXT (s);
ASSERT_ASCTEXT_ASCII (s);
return buffer_insert_string_1 (buf, pos, (const Ibyte *) translated, Qnil,
- 0, strlen (translated), flags);
+ 0, strlen (translated), -1, flags);
}

Charcount
@@ -1319,7 +1333,7 @@
/* This function can GC */
Ibyte str[MAX_ICHAR_LEN];
Bytecount len = set_itext_ichar (str, ch);
- return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, flags);
+ return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, -1, flags);
}

Charcount
@@ -1339,7 +1353,7 @@
/* This function can GC */
Lisp_Object str = make_string_from_buffer (buf2, pos2, length);
return buffer_insert_string_1 (buf, pos, 0, str, 0,
- XSTRING_LENGTH (str), flags);
+ XSTRING_LENGTH (str), -1, flags);
}


@@ -1674,7 +1688,7 @@
* backward so that it now equals the insertion point.
*/
buffer_insert_string_1 (buf, (movepoint ? -1 : pos),
- newstr, Qnil, 0, newlen, 0);
+ newstr, Qnil, 0, newlen, -1, 0);
}
}

diff -r 94a6b8fbd56e src/insdel.h
--- a/src/insdel.h Tue Dec 17 20:49:52 2013 +0200
+++ b/src/insdel.h Thu Dec 19 10:47:01 2013 +0000
@@ -38,7 +38,7 @@
Charcount buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc, Lisp_Object reloc,
Bytecount offset, Bytecount length,
- int flags);
+ Charcount clen, int flags);
Charcount buffer_insert_raw_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc,
Bytecount length, int flags);
@@ -58,7 +58,7 @@
All of these can GC. */

#define buffer_insert_string(buf, nonreloc, reloc, offset, length) \
- buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, 0)
+ buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, -1, 0)
#define buffer_insert_raw_string(buf, string, length) \
buffer_insert_raw_string_1 (buf, -1, string, length, 0)
#define buffer_insert_ascstring(buf, s) \
diff -r 94a6b8fbd56e src/lstream.c
--- a/src/lstream.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/lstream.c Thu Dec 19 10:47:01 2013 +0000
@@ -735,6 +735,11 @@
return Lstream_read_1 (lstr, data, size, 0);
}

+Charcount
+Lstream_character_tell (Lstream *lstr)
+{
+ return lstr->imp->character_tell ? lstr->imp->character_tell (lstr) : -1;
+}

/* Push back SIZE bytes of DATA onto the input queue. The next call
to Lstream_read() with the same size will read the same bytes back.
diff -r 94a6b8fbd56e src/lstream.h
--- a/src/lstream.h Tue Dec 17 20:49:52 2013 +0200
+++ b/src/lstream.h Thu Dec 19 10:47:01 2013 +0000
@@ -181,6 +181,8 @@
method. If this method is not present, the result is determined
by whether a rewind method is present. */
int (*seekable_p) (Lstream *stream);
+
+ Charcount (*character_tell) (Lstream *stream);
/* Perform any additional operations necessary to flush the
data in this stream. */
int (*flusher) (Lstream *stream);
@@ -297,8 +299,8 @@
int Lstream_fputc (Lstream *lstr, int c);
int Lstream_fgetc (Lstream *lstr);
void Lstream_fungetc (Lstream *lstr, int c);
-Bytecount Lstream_read (Lstream *lstr, void *data,
- Bytecount size);
+Bytecount Lstream_read (Lstream *lstr, void *data, Bytecount size);
+Charcount Lstream_character_tell (Lstream *);
int Lstream_write (Lstream *lstr, const void *data,
Bytecount size);
int Lstream_was_blocked_p (Lstream *lstr);
diff -r 94a6b8fbd56e src/print.c
--- a/src/print.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/print.c Thu Dec 19 10:47:01 2013 +0000
@@ -514,7 +514,7 @@

buffer_insert_string_1 (XMARKER (function)->buffer,
spoint, nonreloc, reloc, offset, len,
- 0);
+ -1, 0);
Fset_marker (function, make_fixnum (spoint + cclen),
Fmarker_buffer (function));
}
diff -r 94a6b8fbd56e src/unicode.c
--- a/src/unicode.c Tue Dec 17 20:49:52 2013 +0200
+++ b/src/unicode.c Thu Dec 19 10:47:01 2013 +0000
@@ -1707,6 +1707,7 @@
unsigned char counter;
unsigned char indicated_length;
int seen_char;
+ Charcount characters_seen;
/* encode */
Lisp_Object current_charset;
int current_char_boundary;
@@ -1988,6 +1989,18 @@
write_error_characters_as_such);
}

+static Charcount
+unicode_character_tell (struct coding_stream *str)
+{
+#warning "examine the coding character mode too"n
+ if (CODING_STREAM_TYPE_DATA (str, unicode)->counter == 0)
+ {
+ return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen;
+ }
+
+ return -1;
+}
+
static Bytecount
unicode_convert (struct coding_stream *str, const UExtbyte *src,
unsigned_char_dynarr *dst, Bytecount n)
@@ -2006,6 +2019,7 @@
unsigned char counter = data->counter;
unsigned char indicated_length
= data->indicated_length;
+ Charcount characters_seen = data->characters_seen;

while (n--)
{
@@ -2020,12 +2034,15 @@
{
/* ASCII. */
decode_unicode_char (c, dst, data, ignore_bom);
+ characters_seen++;
}
else if (0 == (c & 0x40))
{
/* Highest bit set, second highest not--there's
something wrong. */
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ /* This is a character in the buffer. */
+ characters_seen++;
}
else if (0 == (c & 0x20))
{
@@ -2050,7 +2067,7 @@
/* We don't supports lengths longer than 4 in
external-format data. */
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-
+ characters_seen++;
}
}
else
@@ -2061,15 +2078,20 @@
indicate_invalid_utf_8(indicated_length,
counter,
ch, dst, data, ignore_bom);
+ /* These are characters our receiver will see, not
+ actual characters we've seen in the input. */
+ characters_seen += (indicated_length - counter);
if (c & 0x80)
{
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ characters_seen++;
}
else
{
/* The character just read is ASCII. Treat it as
such. */
decode_unicode_char (c, dst, data, ignore_bom);
+ characters_seen++;
}
ch = 0;
counter = 0;
@@ -2092,10 +2114,12 @@
counter,
ch, dst, data,
ignore_bom);
+ characters_seen += (indicated_length - counter);
}
else
{
decode_unicode_char (ch, dst, data, ignore_bom);
+ characters_seen++;
}
ch = 0;
}
@@ -2242,6 +2266,7 @@
indicate_invalid_utf_8(indicated_length,
counter, ch, dst, data,
ignore_bom);
+ characters_seen += (indicated_length - counter);
break;

case UNICODE_UTF_16:
@@ -2295,6 +2320,7 @@

data->counter = counter;
data->indicated_length = indicated_length;
+ data->characters_seen = characters_seen;
}
else
{
@@ -3177,6 +3203,8 @@
CODING_SYSTEM_HAS_METHOD (unicode, putprop);
CODING_SYSTEM_HAS_METHOD (unicode, getprop);

+ CODING_SYSTEM_HAS_METHOD (unicode, character_tell);
+
INITIALIZE_DETECTOR (utf_8);
DETECTOR_HAS_METHOD (utf_8, detect);
INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
--
‘Liston operated so fast that he once accidentally amputated an assistant’s
fingers along with a patient’s leg, […] The patient and the assistant both
died of sepsis, and a spectator reportedly died of shock, resulting in the
only known procedure with a 300% mortality.’ (Atul Gawande, NEJM, 2012)
Aidan Kehoe
2014-01-16 17:10:35 UTC
Permalink
APPROVE COMMIT

SUPERSEDES ***@parhasard.net

This doesn’t include character_tell() implementations for lots of the coding
systems where said implementation is easy, e.g. in iso-8859-1. Nor does it
include it for chain coding systems, e.g. those for non-Unix line endings. I
haven’t profiled it to the level I did for the first patch; I should do that
soon.

# HG changeset patch
# User Aidan Kehoe <***@parhasard.net>
# Date 1389889672 0
# Node ID 65d65b52d608ca1f17365a96fc3cf710a3af625c
# Parent 4004c3266c09888a9935242a462beb3fb28e02a3
Pass character count from coding systems to buffer insertion code.

src/ChangeLog addition:

2014-01-16 Aidan Kehoe <***@parhasard.net>

Pass character count information from the no-conversion and
unicode coding systems to the buffer insertion code, making
#'find-file on large buffers a little snappier (if
ERROR_CHECK_TEXT is not defined).

* file-coding.c:
* file-coding.c (coding_character_tell): New.
* file-coding.c (no_conversion_coding_stream_description): New.
* file-coding.c (no_conversion_convert):
Update characters_seen when decoding.
* file-coding.c (no_conversion_character_tell): New.
* file-coding.c (lstream_type_create_file_coding): Create the
no_conversion type with data.
* file-coding.c (coding_system_type_create):
Make the character_tell method available here.
* file-coding.h:
* file-coding.h (struct coding_system_methods):
Add a new character_tell() method, passing charcount information
from the coding systems to the buffer code, avoiding duplicate
bytecount-to-charcount work especially with large buffers.

* fileio.c (Finsert_file_contents_internal):
Update this to pass charcount information to
buffer_insert_string_1(), if that is available from the lstream code.

* insdel.c:
* insdel.c (buffer_insert_string_1):
Add a new CCLEN argument, giving the character count of the string
to insert. It can be -1 to indicate that the function should work
it out itself using bytecount_to_charcount(), as it used to.
* insdel.c (buffer_insert_raw_string_1):
* insdel.c (buffer_insert_lisp_string_1):
* insdel.c (buffer_insert_ascstring_1):
* insdel.c (buffer_insert_emacs_char_1):
* insdel.c (buffer_insert_from_buffer_1):
* insdel.c (buffer_replace_char):
Update these functions to use the new calling convention.
* insdel.h:
* insdel.h (buffer_insert_string):
Update this header to reflect the new buffer_insert_string_1()
argument.

* lstream.c (Lstream_character_tell): New.
Return the number of characters *read* and seen by the consumer so
far, taking into account the unget buffer, and buffered reading.

* lstream.c (Lstream_unread):
Update unget_character_count here as appropriate.
* lstream.c (Lstream_rewind):
Reset unget_character_count here too.

* lstream.h:
* lstream.h (struct lstream):
Provide the character_tell method, add a new field,
unget_character_count, giving the number of characters ever passed
to Lstream_unread().
Declare Lstream_character_tell().
Make Lstream_ungetc(), which happens to be unused, an inline
function rather than a macro, in the course of updating it to
modify unget_character_count.

* print.c (output_string):
Use the new argument to buffer_insert_string_1().
* tests.c:
* tests.c (Ftest_character_tell):
New test function.
* tests.c (syms_of_tests):
Make it available.
* unicode.c:
* unicode.c (struct unicode_coding_stream):
* unicode.c (unicode_character_tell):
New method.
* unicode.c (unicode_convert):
Update the character counter as appropriate.
* unicode.c (coding_system_type_create_unicode):
Make the character_tell method available.

diff -r 4004c3266c09 -r 65d65b52d608 src/ChangeLog
--- a/src/ChangeLog Sun Dec 22 10:36:33 2013 +0000
+++ b/src/ChangeLog Thu Jan 16 16:27:52 2014 +0000
@@ -1,3 +1,82 @@
+2014-01-16 Aidan Kehoe <***@parhasard.net>
+
+ Pass character count information from the no-conversion and
+ unicode coding systems to the buffer insertion code, making
+ #'find-file on large buffers a little snappier (if
+ ERROR_CHECK_TEXT is not defined).
+
+ * file-coding.c:
+ * file-coding.c (coding_character_tell): New.
+ * file-coding.c (no_conversion_coding_stream_description): New.
+ * file-coding.c (no_conversion_convert):
+ Update characters_seen when decoding.
+ * file-coding.c (no_conversion_character_tell): New.
+ * file-coding.c (lstream_type_create_file_coding): Create the
+ no_conversion type with data.
+ * file-coding.c (coding_system_type_create):
+ Make the character_tell method available here.
+ * file-coding.h:
+ * file-coding.h (struct coding_system_methods):
+ Add a new character_tell() method, passing charcount information
+ from the coding systems to the buffer code, avoiding duplicate
+ bytecount-to-charcount work especially with large buffers.
+
+ * fileio.c (Finsert_file_contents_internal):
+ Update this to pass charcount information to
+ buffer_insert_string_1(), if that is available from the lstream code.
+
+ * insdel.c:
+ * insdel.c (buffer_insert_string_1):
+ Add a new CCLEN argument, giving the character count of the string
+ to insert. It can be -1 to indicate that the function should work
+ it out itself using bytecount_to_charcount(), as it used to.
+ * insdel.c (buffer_insert_raw_string_1):
+ * insdel.c (buffer_insert_lisp_string_1):
+ * insdel.c (buffer_insert_ascstring_1):
+ * insdel.c (buffer_insert_emacs_char_1):
+ * insdel.c (buffer_insert_from_buffer_1):
+ * insdel.c (buffer_replace_char):
+ Update these functions to use the new calling convention.
+ * insdel.h:
+ * insdel.h (buffer_insert_string):
+ Update this header to reflect the new buffer_insert_string_1()
+ argument.
+
+ * lstream.c (Lstream_character_tell): New.
+ Return the number of characters *read* and seen by the consumer so
+ far, taking into account the unget buffer, and buffered reading.
+
+ * lstream.c (Lstream_unread):
+ Update unget_character_count here as appropriate.
+ * lstream.c (Lstream_rewind):
+ Reset unget_character_count here too.
+
+ * lstream.h:
+ * lstream.h (struct lstream):
+ Provide the character_tell method, add a new field,
+ unget_character_count, giving the number of characters ever passed
+ to Lstream_unread().
+ Declare Lstream_character_tell().
+ Make Lstream_ungetc(), which happens to be unused, an inline
+ function rather than a macro, in the course of updating it to
+ modify unget_character_count.
+
+ * print.c (output_string):
+ Use the new argument to buffer_insert_string_1().
+ * tests.c:
+ * tests.c (Ftest_character_tell):
+ New test function.
+ * tests.c (syms_of_tests):
+ Make it available.
+ * unicode.c:
+ * unicode.c (struct unicode_coding_stream):
+ * unicode.c (unicode_character_tell):
+ New method.
+ * unicode.c (unicode_convert):
+ Update the character counter as appropriate.
+ * unicode.c (coding_system_type_create_unicode):
+ Make the character_tell method available.
+
2013-12-19 Aidan Kehoe <***@parhasard.net>

* text.c:
diff -r 4004c3266c09 -r 65d65b52d608 src/file-coding.c
--- a/src/file-coding.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/file-coding.c Thu Jan 16 16:27:52 2014 +0000
@@ -1990,6 +1990,14 @@
return Lstream_seekable_p (str->other_end);
}

+static Charcount
+coding_character_tell (Lstream *stream)
+{
+ struct coding_stream *str = CODING_STREAM_DATA (stream);
+
+ return XCODESYSMETH_OR_GIVEN (str->codesys, character_tell, (str), -1);
+}
+
static int
coding_flusher (Lstream *stream)
{
@@ -2823,7 +2831,32 @@

#### Shouldn't we _call_ it that, then? And while we're at it,
separate it into "to_internal" and "to_external"? */
-DEFINE_CODING_SYSTEM_TYPE (no_conversion);
+
+
+struct no_conversion_coding_system
+{
+};
+
+struct no_conversion_coding_stream
+{
+ /* Number of characters seen when decoding. */
+ Charcount characters_seen;
+};
+
+static const struct memory_description no_conversion_coding_system_description[] = {
+ { XD_END }
+};
+
+static const struct memory_description no_conversion_coding_stream_description_1 [] = {
+ { XD_INT, offsetof (struct no_conversion_coding_stream, characters_seen) },
+ { XD_END }
+};
+
+const struct sized_memory_description no_conversion_coding_stream_description = {
+ sizeof (struct no_conversion_coding_stream), no_conversion_coding_stream_description_1
+};
+
+DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion);

/* This is used when reading in "binary" files -- i.e. files that may
contain all 256 possible byte values and that are not to be
@@ -2846,6 +2879,9 @@
DECODE_ADD_BINARY_CHAR (c, dst);
}

+ CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen
+ += orign;
+
if (str->eof)
DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
}
@@ -2904,6 +2940,12 @@
return orign;
}

+static Charcount
+no_conversion_character_tell (struct coding_stream *str)
+{
+ return CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen;
+}
+
DEFINE_DETECTOR (no_conversion);
DEFINE_DETECTOR_CATEGORY (no_conversion, no_conversion);

@@ -4656,6 +4698,7 @@
LSTREAM_HAS_METHOD (coding, writer);
LSTREAM_HAS_METHOD (coding, rewinder);
LSTREAM_HAS_METHOD (coding, seekable_p);
+ LSTREAM_HAS_METHOD (coding, character_tell);
LSTREAM_HAS_METHOD (coding, marker);
LSTREAM_HAS_METHOD (coding, flusher);
LSTREAM_HAS_METHOD (coding, closer);
@@ -4697,9 +4740,10 @@
dump_add_opaque_int (&coding_detector_count);
dump_add_opaque_int (&coding_detector_category_count);

- INITIALIZE_CODING_SYSTEM_TYPE (no_conversion,
- "no-conversion-coding-system-p");
+ INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion,
+ "no-conversion-coding-system-p");
CODING_SYSTEM_HAS_METHOD (no_conversion, convert);
+ CODING_SYSTEM_HAS_METHOD (no_conversion, character_tell);

INITIALIZE_DETECTOR (no_conversion);
DETECTOR_HAS_METHOD (no_conversion, detect);
diff -r 4004c3266c09 -r 65d65b52d608 src/file-coding.h
--- a/src/file-coding.h Sun Dec 22 10:36:33 2013 +0000
+++ b/src/file-coding.h Thu Jan 16 16:27:52 2014 +0000
@@ -353,6 +353,9 @@
a result of the stream being rewound. Optional. */
void (*rewind_coding_stream_method) (struct coding_stream *str);

+ /* Return the number of characters *decoded*. Optional. */
+ Charcount (*character_tell_method) (struct coding_stream *str);
+
/* Finalize coding stream method: Clean up the type-specific data
attached to the coding stream (i.e. in struct TYPE_coding_stream).
Happens when the Lstream is deleted using Lstream_delete() or is
diff -r 4004c3266c09 -r 65d65b52d608 src/fileio.c
--- a/src/fileio.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/fileio.c Thu Jan 16 16:27:52 2014 +0000
@@ -3180,6 +3180,7 @@
struct gcpro ngcpro1;
Lisp_Object stream = make_filedesc_input_stream (fd, 0, total,
LSTR_ALLOW_QUIT);
+ Charcount last_tell = -1;

NGCPRO1 (stream);
Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
@@ -3187,6 +3188,7 @@
(XLSTREAM (stream), get_coding_system_for_text_file (codesys, 1),
CODING_DECODE, 0);
Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ last_tell = Lstream_character_tell (XLSTREAM (stream));

record_unwind_protect (delete_stream_unwind, stream);

@@ -3196,7 +3198,7 @@
while (1)
{
Bytecount this_len;
- Charcount cc_inserted;
+ Charcount cc_inserted, this_tell = last_tell;

QUIT;
this_len = Lstream_read (XLSTREAM (stream), read_buf,
@@ -3209,12 +3211,17 @@
break;
}

- cc_inserted = buffer_insert_raw_string_1 (buf, cur_point, read_buf,
- this_len,
- !NILP (visit)
- ? INSDEL_NO_LOCKING : 0);
+ cc_inserted
+ = buffer_insert_string_1 (buf, cur_point, read_buf, Qnil,
+ 0, this_len, last_tell >= 0
+ ? (this_tell
+ = Lstream_character_tell (XLSTREAM
+ (stream)))
+ - last_tell : -1,
+ !NILP (visit) ? INSDEL_NO_LOCKING : 0);
inserted += cc_inserted;
cur_point += cc_inserted;
+ last_tell = this_tell;
}
if (!NILP (used_codesys))
{
diff -r 4004c3266c09 -r 65d65b52d608 src/insdel.c
--- a/src/insdel.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/insdel.c Thu Jan 16 16:27:52 2014 +0000
@@ -1039,14 +1039,15 @@
#endif
}

-/* Insert a string into BUF at Charbpos POS. The string data comes
- from one of two sources: constant, non-relocatable data (specified
- in NONRELOC), or a Lisp string object (specified in RELOC), which
- is relocatable and may have extent data that needs to be copied
- into the buffer. OFFSET and LENGTH specify the substring of the
- data that is actually to be inserted. As a special case, if POS
- is -1, insert the string at point and move point to the end of the
- string.
+/* Insert a string into BUF at Charbpos POS. The string data comes from one
+ of two sources: constant, non-relocatable data (specified in NONRELOC),
+ or a Lisp string object (specified in RELOC), which is relocatable and
+ may have extent data that needs to be copied into the buffer. OFFSET and
+ LENGTH specify the substring of the data that is actually to be inserted.
+ As a special case, if POS is -1, insert the string at point and move
+ point to the end of the string. CCLEN is the character count of the data
+ to be inserted, and can be -1 to indicate that buffer_insert_string_1 ()
+ should work this out itself with bytecount_to_charcount().

Normally, markers at the insertion point end up before the
inserted string. If INSDEL_BEFORE_MARKERS is set in flags, however,
@@ -1061,13 +1062,12 @@
buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc, Lisp_Object reloc,
Bytecount offset, Bytecount length,
- int flags)
+ Charcount cclen, int flags)
{
/* This function can GC */
struct gcpro gcpro1;
Bytebpos bytepos;
Bytecount length_in_buffer;
- Charcount cclen;
int move_point = 0;
struct buffer *mbuf;
Lisp_Object bufcons;
@@ -1118,14 +1118,27 @@

bytepos = charbpos_to_bytebpos (buf, pos);

- /* string may have been relocated up to this point */
- if (STRINGP (reloc))
+ if (cclen < 0)
{
- cclen = string_offset_byte_to_char_len (reloc, offset, length);
- nonreloc = XSTRING_DATA (reloc);
+ /* string may have been relocated up to this point */
+ if (STRINGP (reloc))
+ {
+ cclen = string_offset_byte_to_char_len (reloc, offset, length);
+ nonreloc = XSTRING_DATA (reloc);
+ }
+ else
+ cclen = bytecount_to_charcount (nonreloc + offset, length);
}
else
- cclen = bytecount_to_charcount (nonreloc + offset, length);
+ {
+ text_checking_assert (cclen > 0 && cclen
+ == (STRINGP (reloc) ?
+ string_offset_byte_to_char_len (reloc, offset,
+ length)
+ : bytecount_to_charcount (nonreloc + offset,
+ length)));
+ }
+
/* &&#### Here we check if the text can't fit into the format of the buffer,
and if so convert it to another format (either default or 32-bit-fixed,
according to some flag; if no flag, use default). */
@@ -1286,7 +1299,7 @@
{
/* This function can GC */
return buffer_insert_string_1 (buf, pos, nonreloc, Qnil, 0, length,
- flags);
+ -1, flags);
}

Charcount
@@ -1295,8 +1308,7 @@
{
/* This function can GC */
return buffer_insert_string_1 (buf, pos, 0, str, 0,
- XSTRING_LENGTH (str),
- flags);
+ XSTRING_LENGTH (str), -1, flags);
}

/* Insert the null-terminated string S (in external format). */
@@ -1309,7 +1321,7 @@
const CIbyte *translated = GETTEXT (s);
ASSERT_ASCTEXT_ASCII (s);
return buffer_insert_string_1 (buf, pos, (const Ibyte *) translated, Qnil,
- 0, strlen (translated), flags);
+ 0, strlen (translated), -1, flags);
}

Charcount
@@ -1319,7 +1331,7 @@
/* This function can GC */
Ibyte str[MAX_ICHAR_LEN];
Bytecount len = set_itext_ichar (str, ch);
- return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, flags);
+ return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, -1, flags);
}

Charcount
@@ -1339,7 +1351,7 @@
/* This function can GC */
Lisp_Object str = make_string_from_buffer (buf2, pos2, length);
return buffer_insert_string_1 (buf, pos, 0, str, 0,
- XSTRING_LENGTH (str), flags);
+ XSTRING_LENGTH (str), -1, flags);
}


@@ -1674,7 +1686,7 @@
* backward so that it now equals the insertion point.
*/
buffer_insert_string_1 (buf, (movepoint ? -1 : pos),
- newstr, Qnil, 0, newlen, 0);
+ newstr, Qnil, 0, newlen, -1, 0);
}
}

diff -r 4004c3266c09 -r 65d65b52d608 src/insdel.h
--- a/src/insdel.h Sun Dec 22 10:36:33 2013 +0000
+++ b/src/insdel.h Thu Jan 16 16:27:52 2014 +0000
@@ -38,7 +38,7 @@
Charcount buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc, Lisp_Object reloc,
Bytecount offset, Bytecount length,
- int flags);
+ Charcount clen, int flags);
Charcount buffer_insert_raw_string_1 (struct buffer *buf, Charbpos pos,
const Ibyte *nonreloc,
Bytecount length, int flags);
@@ -58,7 +58,7 @@
All of these can GC. */

#define buffer_insert_string(buf, nonreloc, reloc, offset, length) \
- buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, 0)
+ buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, -1, 0)
#define buffer_insert_raw_string(buf, string, length) \
buffer_insert_raw_string_1 (buf, -1, string, length, 0)
#define buffer_insert_ascstring(buf, s) \
diff -r 4004c3266c09 -r 65d65b52d608 src/lstream.c
--- a/src/lstream.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/lstream.c Thu Jan 16 16:27:52 2014 +0000
@@ -735,6 +735,134 @@
return Lstream_read_1 (lstr, data, size, 0);
}

+/* Return the count of complete characters LSTR's consumer has read, or
+   -1 if unknown.  The implementation's figure is corrected here for
+   characters still held in the unget buffer or unread in in_buffer.  */
+Charcount
+Lstream_character_tell (Lstream *lstr)
+{
+  Charcount ctell = lstr->imp->character_tell ?
+    lstr->imp->character_tell (lstr) : -1;
+
+  if (ctell >= 0)
+    {
+      /* Our implementation's character tell code doesn't know about the
+         unget buffer, update its figure to reflect it. */
+      ctell += lstr->unget_character_count;
+
+      if (lstr->unget_buffer_ind > 0)
+        {
+          /* The character count should not include those characters
+             currently *in* the unget buffer, subtract that count. */
+          Ibyte *ungot, *ungot_ptr;
+          Bytecount ii = lstr->unget_buffer_ind, impartial, sevenflen;
+
+          /* Room for every ungot octet plus the sentinel character
+             stored in front of them below.  */
+          ungot_ptr = ungot
+            = alloca_ibytes (lstr->unget_buffer_ind + MAX_ICHAR_LEN);
+
+          /* Make sure the string starts with a valid ibyteptr, otherwise
+             validate_ibyte_string_backward could run off the beginning. */
+          sevenflen = set_itext_ichar (ungot, (Ichar) 0x7f);
+          ungot_ptr += sevenflen;
+
+          /* Internal format data, but in reverse order. There's not
+             actually a need to alloca here, we could work out the character
+             count directly from the reversed bytes, but the alloca approach
+             is more robust to changes in our internal format, and the unget
+             buffer is not going to blow the stack. */
+          while (ii > 0)
+            *ungot_ptr++ = lstr->unget_buffer[--ii];
+
+          impartial
+            = validate_ibyte_string_backward (ungot, ungot_ptr - ungot);
+
+          /* Move past the character we added. */
+          impartial -= sevenflen;
+          INC_IBYTEPTR (ungot);
+
+          if (impartial > 0 && !valid_ibyteptr_p (ungot))
+            {
+              Ibyte *newstart = ungot, *limit = ungot + impartial;
+              /* Our consumer has the start of a partial character, we
+                 have the rest. */
+
+              /* Check the bound first so NEWSTART is never read at LIMIT. */
+              while (newstart < limit && !valid_ibyteptr_p (newstart))
+                newstart++, impartial--;
+
+              /* Remove this character from the count, since the
+                 end-consumer hasn't seen the full character. */
+              ctell--;
+              ungot = newstart;
+            }
+          else if (valid_ibyteptr_p (ungot)
+                   && rep_bytes_by_first_byte (*ungot) > impartial)
+            {
+              /* Rest of a partial character has yet to be read, its first
+                 octet has probably been unread by Lstream_read_1(). We
+                 included it in the accounting in Lstream_unread(), adjust
+                 the figure here appropriately. */
+              ctell--;
+            }
+
+          /* bytecount_to_charcount will throw an assertion failure if we're
+             not at the start of a character. */
+          text_checking_assert (impartial == 0 || valid_ibyteptr_p (ungot));
+
+          /* The character length of this text is included in
+             unget_character_count; if the bytes are still in the unget
+             buffer, then our consumers haven't seen them, and so the
+             character tell figure shouldn't reflect them. Subtract it from
+             the total. */
+          ctell -= bytecount_to_charcount (ungot, impartial);
+        }
+
+      if (lstr->in_buffer_ind < lstr->in_buffer_current)
+        {
+          Ibyte *inbuf = lstr->in_buffer + lstr->in_buffer_ind;
+          Bytecount partial = lstr->in_buffer_current - lstr->in_buffer_ind,
+            impartial;
+
+          if (!valid_ibyteptr_p (inbuf))
+            {
+              Ibyte *newstart = inbuf;
+              Ibyte *limit = lstr->in_buffer + lstr->in_buffer_current;
+              /* Our consumer has the start of a partial character, we
+                 have the rest. */
+
+              while (newstart < limit && !valid_ibyteptr_p (newstart))
+                newstart++;
+
+              /* Remove this character from the count, since the
+                 end-consumer hasn't seen the full character. */
+              ctell--;
+              inbuf = newstart;
+              partial = limit - newstart;
+            }
+
+          if (valid_ibyteptr_p (inbuf))
+            {
+              /* There's at least one valid starting char in the string,
+                 validate_ibyte_string_backward won't run off the
+                 beginning. */
+              impartial =
+                validate_ibyte_string_backward (inbuf, partial);
+            }
+          else
+            {
+              impartial = 0;
+            }
+
+          ctell -= bytecount_to_charcount (inbuf, impartial);
+        }
+
+      text_checking_assert (ctell >= 0);
+    }
+
+  return ctell;
+}

/* Push back SIZE bytes of DATA onto the input queue. The next call
to Lstream_read() with the same size will read the same bytes back.
@@ -755,7 +883,12 @@
/* Bytes have to go on in reverse order -- they are reversed
again when read back. */
while (size--)
- lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size];
+ {
+ lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size];
+ /* If we see a valid first byte, that is the last octet in a
+ character, so increase the count of ungot characters. */
+ lstr->unget_character_count += valid_ibyteptr_p (p + size);
+ }
}

/* Rewind the stream to the beginning. */
@@ -768,6 +901,7 @@
if (Lstream_flush (lstr) < 0)
return -1;
lstr->byte_count = 0;
+ lstr->unget_character_count = 0;
return (lstr->imp->rewinder) (lstr);
}

diff -r 4004c3266c09 -r 65d65b52d608 src/lstream.h
--- a/src/lstream.h Sun Dec 22 10:36:33 2013 +0000
+++ b/src/lstream.h Thu Jan 16 16:27:52 2014 +0000
@@ -181,6 +181,10 @@
method. If this method is not present, the result is determined
by whether a rewind method is present. */
int (*seekable_p) (Lstream *stream);
+
+ /* Return the number of complete characters read so far. Respects
+ buffering and unget. Returns -1 if unknown or not implemented. */
+ Charcount (*character_tell) (Lstream *stream);
/* Perform any additional operations necessary to flush the
data in this stream. */
int (*flusher) (Lstream *stream);
@@ -250,8 +254,9 @@
similarly has to push the data on backwards. */
unsigned char *unget_buffer; /* holds characters pushed back onto input */
Bytecount unget_buffer_size; /* allocated size of buffer */
- Bytecount unget_buffer_ind; /* pointer to next buffer spot
- to write a character */
+ Bytecount unget_buffer_ind; /* Next buffer spot to write a character */
+
+ Charcount unget_character_count; /* Count of complete characters ever ungot. */

Bytecount byte_count;
int flags;
@@ -297,8 +302,8 @@
int Lstream_fputc (Lstream *lstr, int c);
int Lstream_fgetc (Lstream *lstr);
void Lstream_fungetc (Lstream *lstr, int c);
-Bytecount Lstream_read (Lstream *lstr, void *data,
- Bytecount size);
+Bytecount Lstream_read (Lstream *lstr, void *data, Bytecount size);
+Charcount Lstream_character_tell (Lstream *);
int Lstream_write (Lstream *lstr, const void *data,
Bytecount size);
int Lstream_was_blocked_p (Lstream *lstr);
@@ -353,19 +358,28 @@
reverse order they were pushed back -- most recent first. (This is
necessary for consistency -- if there are a number of bytes that
have been unread and I read and unread a byte, it needs to be the
- first to be read again.) This is a macro and so it is very
- efficient. The C argument is only evaluated once but the STREAM
- argument is evaluated more than once.
- */
+ first to be read again.) */

-#define Lstream_ungetc(stream, c) \
-/* Add to the end if it won't overflow buffer; otherwise call the \
- function equivalent */ \
- ((stream)->unget_buffer_ind >= (stream)->unget_buffer_size ? \
- Lstream_fungetc (stream, c) : \
- (void) ((stream)->byte_count--, \
- ((stream)->unget_buffer[(stream)->unget_buffer_ind++] = \
- (unsigned char) (c))))
+DECLARE_INLINE_HEADER (
+void
+Lstream_ungetc (Lstream *lstr, int c)
+)
+{
+  unsigned char *slot;
+  /* No room left in the unget buffer: hand the octet to the
+     out-of-line function equivalent instead.  */
+  if (lstr->unget_buffer_ind >= lstr->unget_buffer_size)
+    {
+      Lstream_fungetc (lstr, c);
+      return;
+    }
+  /* Store the octet in the next free slot, then bump the count of
+     ungot characters by the result of valid_ibyteptr_p on it.  */
+  slot = lstr->unget_buffer + lstr->unget_buffer_ind++;
+  lstr->byte_count--;
+  *slot = (unsigned char) (c);
+  lstr->unget_character_count += valid_ibyteptr_p (slot);
+}

#define Lstream_data(stream) ((void *) ((stream)->data))
#define Lstream_byte_count(stream) ((stream)->byte_count)
diff -r 4004c3266c09 -r 65d65b52d608 src/print.c
--- a/src/print.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/print.c Thu Jan 16 16:27:52 2014 +0000
@@ -514,7 +514,7 @@

buffer_insert_string_1 (XMARKER (function)->buffer,
spoint, nonreloc, reloc, offset, len,
- 0);
+ -1, 0);
Fset_marker (function, make_fixnum (spoint + cclen),
Fmarker_buffer (function));
}
diff -r 4004c3266c09 -r 65d65b52d608 src/tests.c
--- a/src/tests.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/tests.c Thu Jan 16 16:27:52 2014 +0000
@@ -558,6 +558,186 @@
return conversion_result;
}

+DEFUN ("test-character-tell", Ftest_character_tell, 0, 0, "", /*
+Return list of results of tests of the stream character offset code.
+For use by the automated test suite. See tests/automated/c-tests.
+
+Each element is a list (DESCRIPTION, STATUS, REASON).
+DESCRIPTION is a string describing the test.
+STATUS is a symbol, either t (pass) or nil (fail).
+REASON is nil or a string describing the failure (not required).
+*/
+ ())
+{
+ Extbyte ext_unix[]= "\n\nfoo\nbar\n\nf\372b\343\340\nfoo\nbar\n";
+ /* Previous string in UTF-8. */
+ Extbyte ext_utf_8_unix[]
+ = "\n\nfoo\nbar\n\nf\303\272b\303\243\303\240\nfoo\nbar\n";
+ Charcount ext_utf_8_unix_char_len = 25;
+ Ibyte shortbuf[13], longbuf[512];
+ Lisp_Object stream =
+ make_fixed_buffer_input_stream (ext_unix, sizeof (ext_unix) - 1);
+ Lisp_Object result = Qnil, string = Qnil;
+ Charcount count;
+ Bytecount bytecount;
+ struct gcpro gcpro1, gcpro2, gcpro3;
+
+#define CHARACTER_TELL_ASSERT(assertion, description, failing_case) \
+ do \
+ { \
+ if (assertion) \
+ result = Fcons (list3 (build_cistring (description), \
+ Qt, Qnil), result); \
+ else \
+ result = Fcons (list3 (build_cistring (description), \
+ Qnil, build_ascstring (failing_case)), \
+ result); \
+ } \
+ while (0)
+
+ GCPRO3 (stream, result, string);
+
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ stream = make_coding_input_stream
+ (XLSTREAM (stream), Ffind_coding_system (intern ("no-conversion-unix")),
+ CODING_DECODE, 0);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+
+ bytecount = Lstream_read (XLSTREAM (stream), longbuf, sizeof (longbuf));
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == sizeof (ext_unix) -1,
+ "basic character tell, no-conversion-unix",
+ "basic character tell failed");
+
+ string = build_extstring (ext_unix,
+ Ffind_coding_system (intern
+ ("no-conversion-unix")));
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == string_char_length (string),
+ "repeat basic character tell, no-conversion-unix",
+ "repeat basic character tell failed with string");
+
+ count = Lstream_character_tell (XLSTREAM (stream));
+
+ Lstream_unread (XLSTREAM (stream), "r\n", 2);
+
+ /* This should give the same result as before the unread. */
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == count, "checking post-unread character tell",
+ "post-unread character tell failed");
+ bytecount += Lstream_read (XLSTREAM (stream), longbuf + bytecount,
+ sizeof (longbuf) - bytecount);
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == count + 2,
+ "checking post-unread+read character tell",
+ "post-unread+read character tell failed");
+
+ /* Lstream_rewind () seems to be buggy for my purposes, so rather than
+ rewinding, close and delete the stream and build a new one below. */
+ Lstream_close (XLSTREAM (stream));
+ Lstream_delete (XLSTREAM (stream));
+
+ stream = make_fixed_buffer_input_stream (ext_unix, sizeof (ext_unix) - 1);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ Lstream_unset_character_mode (XLSTREAM (stream));
+ stream = make_coding_input_stream
+ (XLSTREAM (stream), Ffind_coding_system (intern ("no-conversion-unix")),
+ CODING_DECODE, 0);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ Lstream_unset_character_mode (XLSTREAM (stream));
+
+ bytecount = Lstream_read (XLSTREAM (stream), shortbuf, sizeof (shortbuf));
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ /* This should be equal to sizeof (shortbuf) on
+ non-mule. */
+ == sizeof (shortbuf) - !(byte_ascii_p (0xff)),
+ "character tell with short read, no-conversion-unix",
+ "short read character tell failed");
+
+ Lstream_close (XLSTREAM (stream));
+ Lstream_delete (XLSTREAM (stream));
+
+ stream
+ = make_fixed_buffer_input_stream (ext_utf_8_unix,
+ sizeof (ext_utf_8_unix) - 1);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ stream = make_coding_input_stream
+ (XLSTREAM (stream), Ffind_coding_system (intern ("utf-8-unix")),
+ CODING_DECODE, 0);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+
+ bytecount = Lstream_read (XLSTREAM (stream), longbuf, sizeof (longbuf));
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == ext_utf_8_unix_char_len,
+ "utf-8 character tell, utf-8-unix",
+ "utf-8 character tell failed");
+
+ string = build_extstring (ext_utf_8_unix,
+ Ffind_coding_system (intern
+ ("utf-8-unix")));
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == string_char_length (string),
+ "repeat utf-8 character tell, utf-8-unix",
+ "repeat utf-8 character tell failed with string");
+
+ count = Lstream_character_tell (XLSTREAM (stream));
+
+ Lstream_unread (XLSTREAM (stream), "r\n", 2);
+
+ /* This should give the same result as before the unread. */
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == count, "checking post-unread utf-8 tell",
+ "post-unread utf-8 tell failed");
+ bytecount += Lstream_read (XLSTREAM (stream), longbuf + bytecount,
+ sizeof (longbuf) - bytecount);
+
+ CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream))
+ == count + 2,
+ "checking post-unread+read utf-8 tell",
+ "post-unread+read utf-8 tell failed");
+
+ /* Lstream_rewind () seems to be buggy for my purposes, so rather than
+ rewinding, close and delete the stream and build a new one below. */
+ Lstream_close (XLSTREAM (stream));
+ Lstream_delete (XLSTREAM (stream));
+
+ stream = make_fixed_buffer_input_stream (ext_utf_8_unix, sizeof (ext_utf_8_unix) - 1);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ Lstream_set_character_mode (XLSTREAM (stream));
+
+ stream = make_coding_input_stream
+ (XLSTREAM (stream), Ffind_coding_system (intern ("utf-8-unix")),
+ CODING_DECODE, 0);
+ Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536);
+ Lstream_set_character_mode (XLSTREAM (stream));
+
+ bytecount = Lstream_read (XLSTREAM (stream), shortbuf, sizeof (shortbuf));
+
+ CHARACTER_TELL_ASSERT
+ (bytecount == (sizeof (shortbuf) - 1),
+ "utf-8 Lstream_read, character mode, checking partial char not read",
+ "partial char appars to have been read when it shouldn't");
+
+ CHARACTER_TELL_ASSERT
+ (Lstream_character_tell (XLSTREAM (stream))
+ /* This is shorter, because it's in the middle of a character. */
+ == sizeof (shortbuf) - 1,
+ "utf-8 tell with short read, character mode, utf-8-unix",
+ "utf-8 read character tell, character mode failed");
+
+ Lstream_close (XLSTREAM (stream));
+ Lstream_delete (XLSTREAM (stream));
+
+ UNGCPRO;
+ return result;
+}
+

/* Hash Table testing */

@@ -724,6 +904,7 @@
Vtest_function_list = Qnil;

TESTS_DEFSUBR (Ftest_data_format_conversion);
+ TESTS_DEFSUBR (Ftest_character_tell);
TESTS_DEFSUBR (Ftest_hash_tables);
TESTS_DEFSUBR (Ftest_store_void_in_lisp);
/* Add other test functions here with TESTS_DEFSUBR */
diff -r 4004c3266c09 -r 65d65b52d608 src/unicode.c
--- a/src/unicode.c Sun Dec 22 10:36:33 2013 +0000
+++ b/src/unicode.c Thu Jan 16 16:27:52 2014 +0000
@@ -1707,6 +1707,7 @@
unsigned char counter;
unsigned char indicated_length;
int seen_char;
+ Charcount characters_seen;
/* encode */
Lisp_Object current_charset;
int current_char_boundary;
@@ -1988,6 +1989,17 @@
write_error_characters_as_such);
}

+static Charcount
+unicode_character_tell (struct coding_stream *str)
+{
+  /* Report a count only when COUNTER is zero; presumably a nonzero
+     value means a multi-octet sequence is still being decoded.  */
+  if (CODING_STREAM_TYPE_DATA (str, unicode)->counter != 0)
+    return -1;
+
+  return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen;
+}
+
static Bytecount
unicode_convert (struct coding_stream *str, const UExtbyte *src,
unsigned_char_dynarr *dst, Bytecount n)
@@ -2006,6 +2018,7 @@
unsigned char counter = data->counter;
unsigned char indicated_length
= data->indicated_length;
+ Charcount characters_seen = data->characters_seen;

while (n--)
{
@@ -2020,12 +2033,15 @@
{
/* ASCII. */
decode_unicode_char (c, dst, data, ignore_bom);
+ characters_seen++;
}
else if (0 == (c & 0x40))
{
/* Highest bit set, second highest not--there's
something wrong. */
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ /* This is a character in the buffer. */
+ characters_seen++;
}
else if (0 == (c & 0x20))
{
@@ -2050,7 +2066,7 @@
/* We don't supports lengths longer than 4 in
external-format data. */
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-
+ characters_seen++;
}
}
else
@@ -2061,15 +2077,20 @@
indicate_invalid_utf_8(indicated_length,
counter,
ch, dst, data, ignore_bom);
+ /* These are characters our receiver will see, not
+ actual characters we've seen in the input. */
+ characters_seen += (indicated_length - counter);
if (c & 0x80)
{
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ characters_seen++;
}
else
{
/* The character just read is ASCII. Treat it as
such. */
decode_unicode_char (c, dst, data, ignore_bom);
+ characters_seen++;
}
ch = 0;
counter = 0;
@@ -2092,10 +2113,12 @@
counter,
ch, dst, data,
ignore_bom);
+ characters_seen += (indicated_length - counter);
}
else
{
decode_unicode_char (ch, dst, data, ignore_bom);
+ characters_seen++;
}
ch = 0;
}
@@ -2242,6 +2265,7 @@
indicate_invalid_utf_8(indicated_length,
counter, ch, dst, data,
ignore_bom);
+ characters_seen += (indicated_length - counter);
break;

case UNICODE_UTF_16:
@@ -2295,6 +2319,7 @@

data->counter = counter;
data->indicated_length = indicated_length;
+ data->characters_seen = characters_seen;
}
else
{
@@ -3177,6 +3202,8 @@
CODING_SYSTEM_HAS_METHOD (unicode, putprop);
CODING_SYSTEM_HAS_METHOD (unicode, getprop);

+ CODING_SYSTEM_HAS_METHOD (unicode, character_tell);
+
INITIALIZE_DETECTOR (utf_8);
DETECTOR_HAS_METHOD (utf_8, detect);
INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
--
‘Liston operated so fast that he once accidentally amputated an assistant’s
fingers along with a patient’s leg, […] The patient and the assistant both
died of sepsis, and a spectator reportedly died of shock, resulting in the
only known procedure with a 300% mortality.’ (Atul Gawande, NEJM, 2012)
Loading...