View Javadoc
1   /*
2   Copyright (c) 2008 Health Market Science, Inc.
3   
4   Licensed under the Apache License, Version 2.0 (the "License");
5   you may not use this file except in compliance with the License.
6   You may obtain a copy of the License at
7   
8       http://www.apache.org/licenses/LICENSE-2.0
9   
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15  */
16  
17  package com.healthmarketscience.jackcess.impl;
18  
19  import java.io.BufferedReader;
20  import java.io.IOException;
21  import java.io.InputStreamReader;
22  import java.util.Arrays;
23  import java.util.HashMap;
24  import java.util.Map;
25  
26  import static com.healthmarketscience.jackcess.impl.ByteUtil.ByteStream;
27  
28  /**
29   * Various constants used for creating "general legacy" (access 2000-2007)
30   * sort order text index entries.
31   *
32   * @author James Ahlborn
33   */
34  public class GeneralLegacyIndexCodes {
35  
36    static final int MAX_TEXT_INDEX_CHAR_LENGTH =
37      (JetFormat.TEXT_FIELD_MAX_LENGTH / JetFormat.TEXT_FIELD_UNIT_SIZE);
      // (text values longer than MAX_TEXT_INDEX_CHAR_LENGTH chars are truncated
      // by toIndexCharSequence before being encoded)
38
      // flag byte written after the inline codes of an entry, and again in
      // front of the optional crazy/unprintable code sections
39    static final byte END_TEXT = (byte)0x01;
      // flag byte written as the last byte of an entry
40    static final byte END_EXTRA_TEXT = (byte)0x00;
41
42    // unprintable char is removed from normal text.
43    // pattern for unprintable chars in the extra bytes:
44    // 01 01 01 <pos> 06 <code>
45    // <pos> = (7 + (4 * char_pos)) | 0x8000 (written as big-endian short)
46    // <code> = char code
47    static final int UNPRINTABLE_COUNT_START = 7;
48    static final int UNPRINTABLE_COUNT_MULTIPLIER = 4;
49    static final int UNPRINTABLE_OFFSET_FLAGS = 0x8000;
50    static final byte UNPRINTABLE_MIDFIX = (byte)0x06;
51
52    // international char is replaced with ascii char.
53    // pattern for international chars in the extra bytes:
54    // [ 02 (for each normal char) ] [ <symbol_code> (for each international
      // char) ]
55    static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;
56
57    // see writeCrazyCodes (in this class) for details on writing crazy codes
58    static final byte CRAZY_CODE_START = (byte)0x80;
59    static final byte CRAZY_CODE_1 = (byte)0x02;
60    static final byte CRAZY_CODE_2 = (byte)0x03;
61    static final byte[] CRAZY_CODES_SUFFIX =
62      new byte[]{(byte)0xFF, (byte)0x02, (byte)0x80, (byte)0xFF, (byte)0x80};
      // written between the crazy codes and the unprintable codes when an entry
      // has both
63    static final byte CRAZY_CODES_UNPRINT_SUFFIX = (byte)0xFF;
64
65    // stash the codes in some resource files (one line per char, read by
      // loadCodes)
66    private static final String CODES_FILE =
67      DatabaseImpl.RESOURCE_PATH + "index_codes_genleg.txt";
68    private static final String EXT_CODES_FILE =
69      DatabaseImpl.RESOURCE_PATH + "index_codes_ext_genleg.txt";
70  
71    /**
72     * Enum which classifies the types of char encoding strategies used when
73     * creating text index entries.
74     */
75    enum Type {
76      SIMPLE("S") {
77        @Override public CharHandler parseCodes(String[] codeStrings) {
78          return parseSimpleCodes(codeStrings);
79        }
80      },
81      INTERNATIONAL("I") {
82        @Override public CharHandler parseCodes(String[] codeStrings) {
83          return parseInternationalCodes(codeStrings);
84        }
85      },
86      UNPRINTABLE("U") {
87        @Override public CharHandler parseCodes(String[] codeStrings) {
88          return parseUnprintableCodes(codeStrings);
89        }
90      },
91      UNPRINTABLE_EXT("P") {
92        @Override public CharHandler parseCodes(String[] codeStrings) {
93          return parseUnprintableExtCodes(codeStrings);
94        }
95      },
96      INTERNATIONAL_EXT("Z") {
97        @Override public CharHandler parseCodes(String[] codeStrings) {
98          return parseInternationalExtCodes(codeStrings);
99        }
100     },
101     SIGNIFICANT("G") {
102       @Override public CharHandler parseCodes(String[] codeStrings) {
103         return parseSignificantCodes(codeStrings);
104       }
105     },
106     SURROGATE("Q") {
107       @Override public CharHandler parseCodes(String[] codeStrings) {
108         // these are not parsed from the codes files
109         throw new UnsupportedOperationException();
110       }
111     },
112     IGNORED("X") {
113       @Override public CharHandler parseCodes(String[] codeStrings) {
114         return IGNORED_CHAR_HANDLER;
115       }
116     };
117 
118     private final String _prefixCode;
119 
120     private Type(String prefixCode) {
121       _prefixCode = prefixCode;
122     }
123 
124     public String getPrefixCode() {
125       return _prefixCode;
126     }
127 
128     public abstract CharHandler parseCodes(String[] codeStrings);
129   }
130 
131   /**
132    * Base class for the handlers which hold the text index character encoding
133    * information.
134    */
135   abstract static class CharHandler {
136     public abstract Type getType();
137     public byte[] getInlineBytes(char c) {
138       return null;
139     }
140     public byte[] getExtraBytes() {
141       return null;
142     }
143     public byte[] getUnprintableBytes() {
144       return null;
145     }
146     public byte getExtraByteModifier() {
147       return 0;
148     }
149     public byte getCrazyFlag() {
150       return 0;
151     }
152     public boolean isSignificantChar() {
153       return false;
154     }
155   }
156 
157   /**
158    * CharHandler for Type.SIMPLE
159    */
160   private static final class SimpleCharHandler extends CharHandler {
161     private final byte[] _bytes;
162     private SimpleCharHandler(byte[] bytes) {
163       _bytes = bytes;
164     }
165     @Override public Type getType() {
166       return Type.SIMPLE;
167     }
168     @Override public byte[] getInlineBytes(char c) {
169       return _bytes;
170     }
171   }
172 
173   /**
174    * CharHandler for Type.INTERNATIONAL
175    */
176   private static final class InternationalCharHandler extends CharHandler {
177     private final byte[] _bytes;
178     private final byte[] _extraBytes;
179     private InternationalCharHandler(byte[] bytes, byte[] extraBytes) {
180       _bytes = bytes;
181       _extraBytes = extraBytes;
182     }
183     @Override public Type getType() {
184       return Type.INTERNATIONAL;
185     }
186     @Override public byte[] getInlineBytes(char c) {
187       return _bytes;
188     }
189     @Override public byte[] getExtraBytes() {
190       return _extraBytes;
191     }
192   }
193 
194   /**
195    * CharHandler for Type.UNPRINTABLE
196    */
197   private static final class UnprintableCharHandler extends CharHandler {
198     private final byte[] _unprintBytes;
199     private UnprintableCharHandler(byte[] unprintBytes) {
200       _unprintBytes = unprintBytes;
201     }
202     @Override public Type getType() {
203       return Type.UNPRINTABLE;
204     }
205     @Override public byte[] getUnprintableBytes() {
206       return _unprintBytes;
207     }
208   }
209 
210   /**
211    * CharHandler for Type.UNPRINTABLE_EXT
212    */
213   private static final class UnprintableExtCharHandler extends CharHandler {
214     private final byte _extraByteMod;
215     private UnprintableExtCharHandler(Byte extraByteMod) {
216       _extraByteMod = extraByteMod;
217     }
218     @Override public Type getType() {
219       return Type.UNPRINTABLE_EXT;
220     }
221     @Override public byte getExtraByteModifier() {
222       return _extraByteMod;
223     }
224   }
225 
226   /**
227    * CharHandler for Type.INTERNATIONAL_EXT
228    */
229   private static final class InternationalExtCharHandler extends CharHandler {
230     private final byte[] _bytes;
231     private final byte[] _extraBytes;
232     private final byte _crazyFlag;
233     private InternationalExtCharHandler(byte[] bytes, byte[] extraBytes,
234                                         byte crazyFlag) {
235       _bytes = bytes;
236       _extraBytes = extraBytes;
237       _crazyFlag = crazyFlag;
238     }
239     @Override public Type getType() {
240       return Type.INTERNATIONAL_EXT;
241     }
242     @Override public byte[] getInlineBytes(char c) {
243       return _bytes;
244     }
245     @Override public byte[] getExtraBytes() {
246       return _extraBytes;
247     }
248     @Override public byte getCrazyFlag() {
249       return _crazyFlag;
250     }
251   }
252 
253   /**
254    * CharHandler for Type.SIGNIFICANT
255    */
256   private static final class SignificantCharHandler extends CharHandler {
257     private final byte[] _bytes;
258     private SignificantCharHandler(byte[] bytes) {
259       _bytes = bytes;
260     }
261     @Override public Type getType() {
262       return Type.SIGNIFICANT;
263     }
264     @Override public byte[] getInlineBytes(char c) {
265       return _bytes;
266     }
267     @Override public boolean isSignificantChar() {
268       return true;
269     }
270   }
271 
272   /** shared CharHandler instance for Type.IGNORED.  it overrides nothing
          but getType(), so every other accessor keeps the CharHandler default
          (null/0/false) */
273   static final CharHandler IGNORED_CHAR_HANDLER = new CharHandler() {
274     @Override public Type getType() {
275       return Type.IGNORED;
276     }
277   };
278
279   /** the surrogate char bufs are computed on the fly.  re-use a buffer for
280       those (one 2 byte buffer per thread, which is overwritten by every
          call to SurrogateCharHandler.toInlineBytes) */
281   private static final ThreadLocal<byte[]> SURROGATE_CHAR_BUF =
282     ThreadLocal.withInitial(() -> new byte[2]);
      /** the extra bytes returned by every SurrogateCharHandler */
283   private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f};
284 
285   private static abstract class SurrogateCharHandler extends CharHandler {
286     @Override public Type getType() {
287       return Type.SURROGATE;
288     }
289     @Override public byte[] getExtraBytes() {
290       return SURROGATE_EXTRA_BYTES;
291     }
292     protected static byte[] toInlineBytes(int idxC) {
293       byte[] bytes = SURROGATE_CHAR_BUF.get();
294       bytes[0] = (byte)((idxC >>> 8) & 0xFF);
295       bytes[1] = (byte)(idxC & 0xFF);
296       return bytes;
297     }
298   }
299 
300   /** shared CharHandler instance for "high surrogate" chars (which are
301       computed) */
302   static final CharHandler HIGH_SURROGATE_CHAR_HANDLER =
303     new SurrogateCharHandler() {
304       @Override public byte[] getInlineBytes(char c) {
305         // the high sorrogate bytes seems to be computed from a fixed offset
306         int idxC = asUnsignedChar(c) - 10238;
307         return toInlineBytes(idxC);
308       }
309     };
310 
311   /** shared CharHandler instance for "low surrogate" chars (which are
312       computed) */
313   static final CharHandler LOW_SURROGATE_CHAR_HANDLER =
314     new SurrogateCharHandler() {
315       @Override public byte[] getInlineBytes(char c) {
316         // the low surrogate bytes are computed with a specific value based in
317         // its location in a 1024 character block.
318         int charOffset = (asUnsignedChar(c) - 0xdc00) % 1024;
319 
320         int idxOffset = 0;
321         if(charOffset < 8) {
322           idxOffset = 9992;
323         } else if(charOffset < (8 + 254)) {
324           idxOffset = 9990;
325         } else if(charOffset < (8 + 254 + 254)) {
326           idxOffset = 9988;
327         } else if(charOffset < (8 + 254 + 254 + 254)) {
328           idxOffset = 9986;
329         } else  {
330           idxOffset = 9984;
331         }
332         int idxC = asUnsignedChar(c) - idxOffset;
333         return toInlineBytes(idxC);
334       }
335     };
336 
337   static final char FIRST_CHAR = (char)0x0000;
338   static final char LAST_CHAR = (char)0x00FF;
      // chars in FIRST_CHAR..LAST_CHAR are looked up in Codes, all chars from
      // FIRST_EXT_CHAR through LAST_EXT_CHAR are looked up in ExtCodes (see
      // getCharHandler)
339   static final char FIRST_EXT_CHAR = LAST_CHAR + 1;
340   static final char LAST_EXT_CHAR = (char)0xFFFF;
341 
342   private static final class Codes
343   {
344     /** handlers for the first 256 chars.  use nested class to lazy load the
345         handlers */
346     private static final CharHandler[] _values = loadCodes(
347         CODES_FILE, FIRST_CHAR, LAST_CHAR);
348   }
349 
350   private static final class ExtCodes
351   {
352     /** handlers for the rest of the chars in BMP 0.  use nested class to
353         lazy load the handlers */
354     private static final CharHandler[] _values = loadCodes(
355         EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
356   }
357 
358   static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE =
359     new GeneralLegacyIndexCodes(); // shared instance of this class
360
      // package-private, no-op constructor (the class declares no instance
      // fields)
361   GeneralLegacyIndexCodes() {
362   }
363 
364   /**
365    * Returns the CharHandler for the given character.
366    */
367   CharHandler getCharHandler(char c)
368   {
369     if(c <= LAST_CHAR) {
370       return Codes._values[c];
371     }
372 
373     int extOffset = asUnsignedChar(c) - asUnsignedChar(FIRST_EXT_CHAR);
374     return ExtCodes._values[extOffset];
375   }
376 
377   /**
378    * Loads the CharHandlers for the given range of characters from the
379    * resource file with the given name.
380    */
381   static CharHandler[] loadCodes(String codesFilePath,
382                                  char firstChar, char lastChar)
383   {
384     int numCodes = (asUnsignedChar(lastChar) - asUnsignedChar(firstChar)) + 1;
385     CharHandler[] values = new CharHandler[numCodes];
386 
387     Map<String,Type> prefixMap = new HashMap<String,Type>();
388     for(Type type : Type.values()) {
389       prefixMap.put(type.getPrefixCode(), type);
390     }
391 
392     BufferedReader reader = null;
393     try {
394 
395       reader = new BufferedReader(
396           new InputStreamReader(
397               DatabaseImpl.getResourceAsStream(codesFilePath), "US-ASCII"));
398 
399       int start = asUnsignedChar(firstChar);
400       int end = asUnsignedChar(lastChar);
401       for(int i = start; i <= end; ++i) {
402         char c = (char)i;
403         CharHandler ch = null;
404         if(Character.isHighSurrogate(c)) {
405           // surrogate chars are not included in the codes files
406           ch = HIGH_SURROGATE_CHAR_HANDLER;
407         } else if(Character.isLowSurrogate(c)) {
408           // surrogate chars are not included in the codes files
409           ch = LOW_SURROGATE_CHAR_HANDLER;
410         } else {
411           String codeLine = reader.readLine();
412           ch = parseCodes(prefixMap, codeLine);
413         }
414         values[(i - start)] = ch;
415       }
416 
417     } catch(IOException e) {
418       throw new RuntimeException("failed loading index codes file " +
419                                  codesFilePath, e);
420     } finally {
421       ByteUtil.closeQuietly(reader);
422     }
423 
424     return values;
425   }
426 
427   /**
428    * Returns a CharHandler parsed from the given line from an index codes
429    * file.
430    */
431   private static CharHandler parseCodes(Map<String,Type> prefixMap,
432                                         String codeLine)
433   {
434     String prefix = codeLine.substring(0, 1);
435     String suffix = ((codeLine.length() > 1) ? codeLine.substring(1) : "");
436     return prefixMap.get(prefix).parseCodes(suffix.split(",", -1));
437   }
438 
439   /**
440    * Returns a SimpleCharHandler parsed from the given index code strings.
441    */
442   private static CharHandler parseSimpleCodes(String[] codeStrings)
443   {
444     if(codeStrings.length != 1) {
445       throw new IllegalStateException("Unexpected code strings " +
446                                       Arrays.asList(codeStrings));
447     }
448     return new SimpleCharHandler(codesToBytes(codeStrings[0], true));
449   }
450 
451   /**
452    * Returns an InternationalCharHandler parsed from the given index code
453    * strings.
454    */
455   private static CharHandler parseInternationalCodes(String[] codeStrings)
456   {
457     if(codeStrings.length != 2) {
458       throw new IllegalStateException("Unexpected code strings " +
459                                       Arrays.asList(codeStrings));
460     }
461     return new InternationalCharHandler(codesToBytes(codeStrings[0], true),
462                                         codesToBytes(codeStrings[1], true));
463   }
464 
465   /**
466    * Returns a UnprintableCharHandler parsed from the given index code
467    * strings.
468    */
469   private static CharHandler parseUnprintableCodes(String[] codeStrings)
470   {
471     if(codeStrings.length != 1) {
472       throw new IllegalStateException("Unexpected code strings " +
473                                       Arrays.asList(codeStrings));
474     }
475     return new UnprintableCharHandler(codesToBytes(codeStrings[0], true));
476   }
477 
478   /**
479    * Returns a UnprintableExtCharHandler parsed from the given index code
480    * strings.
481    */
482   private static CharHandler parseUnprintableExtCodes(String[] codeStrings)
483   {
484     if(codeStrings.length != 1) {
485       throw new IllegalStateException("Unexpected code strings " +
486                                       Arrays.asList(codeStrings));
487     }
488     byte[] bytes = codesToBytes(codeStrings[0], true);
489     if(bytes.length != 1) {
490       throw new IllegalStateException("Unexpected code strings " +
491                                       Arrays.asList(codeStrings));
492     }
493     return new UnprintableExtCharHandler(bytes[0]);
494   }
495 
496   /**
497    * Returns a InternationalExtCharHandler parsed from the given index code
498    * strings.
499    */
500   private static CharHandler parseInternationalExtCodes(String[] codeStrings)
501   {
502     if(codeStrings.length != 3) {
503       throw new IllegalStateException("Unexpected code strings " +
504                                       Arrays.asList(codeStrings));
505     }
506 
507     byte crazyFlag = ("1".equals(codeStrings[2]) ?
508                       CRAZY_CODE_1 : CRAZY_CODE_2);
509     return new InternationalExtCharHandler(codesToBytes(codeStrings[0], true),
510                                            codesToBytes(codeStrings[1], false),
511                                            crazyFlag);
512   }
513 
514   /**
515    * Returns a SignificantCharHandler parsed from the given index code strings.
516    */
517   private static CharHandler parseSignificantCodes(String[] codeStrings)
518   {
519     if(codeStrings.length != 1) {
520       throw new IllegalStateException("Unexpected code strings " +
521                                       Arrays.asList(codeStrings));
522     }
523     return new SignificantCharHandler(codesToBytes(codeStrings[0], true));
524   }
525 
526   /**
527    * Converts a string of hex encoded bytes to a byte[], optionally throwing
528    * an exception if no codes are given.
529    */
530   private static byte[] codesToBytes(String codes, boolean required)
531   {
532     if(codes.length() == 0) {
533       if(required) {
534         throw new IllegalStateException("empty code bytes");
535       }
536       return null;
537     }
538     if((codes.length() % 2) != 0) {
539       // stripped a leading 0
540       codes = "0" + codes;
541     }
542     byte[] bytes = new byte[codes.length() / 2];
543     for(int i = 0; i < bytes.length; ++i) {
544       int charIdx = i*2;
545       bytes[i] = (byte)(Integer.parseInt(codes.substring(charIdx, charIdx + 2),
546                                          16));
547     }
548     return bytes;
549   }
550 
551   /**
552    * Returns an the char value converted to an unsigned char value.  Note, I
553    * think this is unnecessary (I think java treats chars as unsigned), but I
554    * did this just to be on the safe side.
555    */
556   static int asUnsignedChar(char c)
557   {
558     return c & 0xFFFF;
559   }
560 
561   /**
562    * Converts an index value for a text column into the entry value (which
563    * is based on a variety of nifty codes).
564    */
565   void writeNonNullIndexTextValue(
566       Object value, ByteStream bout, boolean isAscending)
567     throws IOException
568   {
569     // convert to string
570     String str = toIndexCharSequence(value);
571 
572     // record previous entry length so we can do any post-processing
573     // necessary for this entry (handling descending)
574     int prevLength = bout.getLength();
575 
576     // now, convert each character to a "code" of one or more bytes
577     ExtraCodesStream extraCodes = null;
578     ByteStream unprintableCodes = null;
579     ByteStream crazyCodes = null;
580     int charOffset = 0;
581     for(int i = 0; i < str.length(); ++i) {
582 
583       char c = str.charAt(i);
584       CharHandler ch = getCharHandler(c);
585 
586       int curCharOffset = charOffset;
587       byte[] bytes = ch.getInlineBytes(c);
588       if(bytes != null) {
589         // write the "inline" codes immediately
590         bout.write(bytes);
591 
592         // only increment the charOffset for chars with inline codes
593         ++charOffset;
594       }
595 
596       if(ch.getType() == Type.SIMPLE) {
597         // common case, skip further code handling
598         continue;
599       }
600 
601       bytes = ch.getExtraBytes();
602       byte extraCodeModifier = ch.getExtraByteModifier();
603       if((bytes != null) || (extraCodeModifier != 0)) {
604         if(extraCodes == null) {
605           extraCodes = new ExtraCodesStream(str.length());
606         }
607 
608         // keep track of the extra codes for later
609         writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes);
610       }
611 
612       bytes = ch.getUnprintableBytes();
613       if(bytes != null) {
614         if(unprintableCodes == null) {
615           unprintableCodes = new ByteStream();
616         }
617 
618         // keep track of the unprintable codes for later
619         writeUnprintableCodes(curCharOffset, bytes, unprintableCodes,
620                               extraCodes);
621       }
622 
623       byte crazyFlag = ch.getCrazyFlag();
624       if(crazyFlag != 0) {
625         if(crazyCodes == null) {
626           crazyCodes = new ByteStream();
627         }
628 
629         // keep track of the crazy flags for later
630         crazyCodes.write(crazyFlag);
631       }
632     }
633 
634     // write end text flag
635     bout.write(END_TEXT);
636 
637     boolean hasExtraCodes = trimExtraCodes(
638         extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER);
639     boolean hasUnprintableCodes = (unprintableCodes != null);
640     boolean hasCrazyCodes = (crazyCodes != null);
641     if(hasExtraCodes || hasUnprintableCodes || hasCrazyCodes) {
642 
643       // we write all the international extra bytes first
644       if(hasExtraCodes) {
645         extraCodes.writeTo(bout);
646       }
647 
648       if(hasCrazyCodes || hasUnprintableCodes) {
649 
650         // write 2 more end flags
651         bout.write(END_TEXT);
652         bout.write(END_TEXT);
653 
654         // next come the crazy flags
655         if(hasCrazyCodes) {
656 
657           writeCrazyCodes(crazyCodes, bout);
658 
659           // if we are writing unprintable codes after this, tack on another
660           // code
661           if(hasUnprintableCodes) {
662             bout.write(CRAZY_CODES_UNPRINT_SUFFIX);
663           }
664         }
665 
666         // then we write all the unprintable extra bytes
667         if(hasUnprintableCodes) {
668 
669           // write another end flag
670           bout.write(END_TEXT);
671 
672           unprintableCodes.writeTo(bout);
673         }
674       }
675     }
676 
677     // handle descending order by inverting the bytes
678     if(!isAscending) {
679 
680       // we actually write the end byte before flipping the bytes, and write
681       // another one after flipping
682       bout.write(END_EXTRA_TEXT);
683 
684       // flip the bytes that we have written thus far for this text value
685       IndexData.flipBytes(bout.getBytes(), prevLength,
686                           (bout.getLength() - prevLength));
687     }
688 
689     // write end extra text
690     bout.write(END_EXTRA_TEXT);
691   }
692 
693   protected static String toIndexCharSequence(Object value)
694       throws IOException {
695 
696     // first, convert to string
697     String str = ColumnImpl.toCharSequence(value).toString();
698 
699     // all text columns (including memos) are only indexed up to the max
700     // number of chars in a VARCHAR column
701     int len = str.length();
702     if(len > MAX_TEXT_INDEX_CHAR_LENGTH) {
703       str = str.substring(0, MAX_TEXT_INDEX_CHAR_LENGTH);
704       len = MAX_TEXT_INDEX_CHAR_LENGTH;
705     }
706 
707     // trailing spaces are ignored for text index entries
708     if((len > 0) && (str.charAt(len - 1) == ' ')) {
709       do {
710         --len;
711       } while((len > 0) && (str.charAt(len - 1) == ' '));
712 
713       str = str.substring(0, len);
714     }
715 
716     return str;
717   }
718 
719   /**
720    * Encodes the given extra code info in the given stream.
721    */
722   private static void writeExtraCodes(
723       int charOffset, byte[] bytes, byte extraCodeModifier,
724       ExtraCodesStream extraCodes)
725   {
726     // we fill in a placeholder value for any chars w/out extra codes
727     int numChars = extraCodes.getNumChars();
728     if(numChars < charOffset) {
729       int fillChars = charOffset - numChars;
730       extraCodes.writeFill(fillChars, INTERNATIONAL_EXTRA_PLACEHOLDER);
731       extraCodes.incrementNumChars(fillChars);
732     }
733 
734     if(bytes != null) {
735 
736       // write the actual extra codes and update the number of chars
737       extraCodes.write(bytes);
738       extraCodes.incrementNumChars(1);
739 
740     } else {
741 
742       // extra code modifiers modify the existing extra code bytes and do not
743       // count as additional extra code chars
744       int lastIdx = extraCodes.getLength() - 1;
745       if(lastIdx >= 0) {
746 
747         // the extra code modifier is added to the last extra code written
748         byte lastByte = extraCodes.get(lastIdx);
749         lastByte += extraCodeModifier;
750         extraCodes.set(lastIdx, lastByte);
751 
752       } else {
753 
754         // there is no previous extra code, add a new code (but keep track of
755         // this "unprintable code" prefix)
756         extraCodes.write(extraCodeModifier);
757         extraCodes.setUnprintablePrefixLen(1);
758       }
759     }
760   }
761 
762   /**
763    * Trims any bytes in the given range off of the end of the given stream,
764    * returning whether or not there are any bytes left in the given stream
765    * after trimming.
766    */
767   private static boolean trimExtraCodes(ByteStream extraCodes,
768                                         byte minTrimCode, byte maxTrimCode)
769   {
770     if(extraCodes == null) {
771       return false;
772     }
773 
774     extraCodes.trimTrailing(minTrimCode, maxTrimCode);
775 
776     // anything left?
777     return (extraCodes.getLength() > 0);
778   }
779 
780   /**
781    * Encodes the given unprintable char codes in the given stream.
782    */
783   private static void writeUnprintableCodes(
784       int charOffset, byte[] bytes, ByteStream unprintableCodes,
785       ExtraCodesStream extraCodes)
786   {
787     // the offset seems to be calculated based on the number of bytes in the
788     // "extra codes" part of the entry (even if there are no extra codes bytes
789     // actually written in the final entry).
790     int unprintCharOffset = charOffset;
791     if(extraCodes != null) {
792       // we need to account for some extra codes which have not been written
793       // yet.  additionally, any unprintable bytes added to the beginning of
794       // the extra codes are ignored.
795       unprintCharOffset = extraCodes.getLength() +
796         (charOffset - extraCodes.getNumChars()) -
797         extraCodes.getUnprintablePrefixLen();
798     }
799 
800     // we write a whacky combo of bytes for each unprintable char which
801     // includes a funky offset and extra char itself
802     int offset =
803       (UNPRINTABLE_COUNT_START +
804        (UNPRINTABLE_COUNT_MULTIPLIER * unprintCharOffset))
805       | UNPRINTABLE_OFFSET_FLAGS;
806 
807     // write offset as big-endian short
808     unprintableCodes.write((offset >> 8) & 0xFF);
809     unprintableCodes.write(offset & 0xFF);
810 
811     unprintableCodes.write(UNPRINTABLE_MIDFIX);
812     unprintableCodes.write(bytes);
813   }
814 
815   /**
816    * Encode the given crazy code bytes into the given byte stream.
817    */
818   private static void writeCrazyCodes(ByteStream crazyCodes, ByteStream bout)
819   {
820     // CRAZY_CODE_2 flags at the end are ignored, so ditch them
821     trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);
822 
823     if(crazyCodes.getLength() > 0) {
824 
825       // the crazy codes get encoded into 6 bit sequences where each code is 2
826       // bits (where the first 2 bits in the byte are a common prefix).
827       byte curByte = CRAZY_CODE_START;
828       int idx = 0;
829       for(int i = 0; i < crazyCodes.getLength(); ++i) {
830         byte nextByte = crazyCodes.get(i);
831         nextByte <<= ((2 - idx) * 2);
832         curByte |= nextByte;
833 
834         ++idx;
835         if(idx == 3) {
836           // write current byte and reset
837           bout.write(curByte);
838           curByte = CRAZY_CODE_START;
839           idx = 0;
840         }
841       }
842 
843       // write last byte
844       if(idx > 0) {
845         bout.write(curByte);
846       }
847     }
848 
849     // write crazy code suffix (note, we write this even if all the codes are
850     // trimmed
851     bout.write(CRAZY_CODES_SUFFIX);
852   }
853 
854   /**
855    * Extension of ByteStream which keeps track of an additional char count and
856    * the length of any "unprintable" code prefix.
857    */
858   private static final class ExtraCodesStream extends ByteStream
859   {
860     private int _numChars;
861     private int _unprintablePrefixLen;
862 
863     private ExtraCodesStream(int length) {
864       super(length);
865     }
866 
867     public int getNumChars() {
868       return _numChars;
869     }
870 
871     public void incrementNumChars(int inc) {
872       _numChars += inc;
873     }
874 
875     public int getUnprintablePrefixLen() {
876       return _unprintablePrefixLen;
877     }
878 
879     public void setUnprintablePrefixLen(int len) {
880       _unprintablePrefixLen = len;
881     }
882   }
883 
884 }