Java Tutorial/I18N/Charset

Материал из Java эксперт
Перейти к: навигация, поиск

Converting Between Strings (Unicode) and Other Character Set Encodings

   <source lang="java">

import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; public class Main {

 /**
  * Round-trips a string through ISO-8859-1: the text is encoded to bytes and
  * the bytes are decoded back into a string.
  *
  * @param argv ignored
  * @throws Exception if encoding or decoding fails
  */
 public static void main(String[] argv) throws Exception {
   final Charset latin1 = Charset.forName("ISO-8859-1");
   final CharsetEncoder toBytes = latin1.newEncoder();
   final CharsetDecoder toChars = latin1.newDecoder();

   // String -> bytes: the convenience overload allocates the output buffer itself.
   final CharBuffer source = CharBuffer.wrap("a string");
   final ByteBuffer encoded = toBytes.encode(source);

   // bytes -> String.
   final CharBuffer decoded = toChars.decode(encoded);
   final String roundTripped = decoded.toString();
 }

}</source>





Detecting non-ASCII characters in a string

   <source lang="java">

import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.Arrays; public class Main {

 /**
  * Detects non-ASCII content by decoding bytes with a strict US-ASCII decoder:
  * decoding succeeds for pure ASCII input and fails with a
  * {@code CharacterCodingException} otherwise.
  *
  * @param args ignored
  * @throws Exception if the named charsets are unavailable
  */
 public static void main(String[] args) throws Exception {
   // Encode with an explicit charset; the platform default could silently turn
   // the non-ASCII character into '?' and hide what this demo is meant to detect.
   byte[] invalidBytes = "� ".getBytes("UTF-8");
   byte[] validBytes = "(c)".getBytes("UTF-8");
   CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();

   CharBuffer buffer = decoder.decode(ByteBuffer.wrap(validBytes));
   System.out.println(Arrays.toString(buffer.array()));

   try {
     // decode(ByteBuffer) resets the decoder first, so reusing it is safe.
     buffer = decoder.decode(ByteBuffer.wrap(invalidBytes));
     System.out.println(Arrays.toString(buffer.array()));
   } catch (java.nio.charset.CharacterCodingException e) {
     // This failure is the detection signal, so report it instead of crashing.
     System.out.println("Non-ASCII input detected: " + e);
   }
 }

}</source>





Encoding and decoding with caller-supplied buffers

   <source lang="java">

import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; public class Main {

 /**
  * Encodes and decodes through buffers supplied by the caller instead of the
  * buffers the convenience {@code encode}/{@code decode} overloads allocate.
  *
  * @param argv ignored
  * @throws Exception if an encoding or decoding step does not complete
  */
 public static void main(String[] argv) throws Exception {
   Charset charset = Charset.forName("ISO-8859-1");
   CharsetDecoder decoder = charset.newDecoder();
   CharsetEncoder encoder = charset.newEncoder();
   ByteBuffer bbuf = ByteBuffer.allocateDirect(1024);
   CharBuffer cbuf = CharBuffer.allocate(1024);

   // cbuf is used as allocated (position 0, limit 1024); real code would put
   // its text into the buffer and flip it before this point.
   // This is the only chunk of input, so signal end-of-input and then flush.
   requireComplete(encoder.encode(cbuf, bbuf, true));
   requireComplete(encoder.flush(bbuf));
   bbuf.flip();

   // Encoding consumed cbuf completely; clear it so the decoder has room to
   // write. Without this the decode reports OVERFLOW and produces nothing.
   cbuf.clear();
   requireComplete(decoder.decode(bbuf, cbuf, true));
   requireComplete(decoder.flush(cbuf));
   cbuf.flip();
 }

 /** Throws the exception matching {@code result} unless the operation ran to completion. */
 private static void requireComplete(java.nio.charset.CoderResult result) throws Exception {
   if (!result.isUnderflow()) {
     result.throwException();
   }
 }

}</source>





Extending Charset to create a hex charset

   <source lang="java">

/*

* HexCharset.java
*
* Created on 22 December 2005, 21:56
*
* To change this template, choose Tools | Options and locate the template under
* the Source Creation and Management node. Right-click the template and choose
* Open. You can then make changes to the template in the Source Editor.
*/

import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; /**

* Codec to translate between hex coding and byte string.
* Hex output is capital if the char set name is given in capitals.
* hex:nn used as a charset name inserts \n after every nnth character.
* @author malcolmm
*/

public class HexCharset extends Charset {

   private final static String codeHEX = "0123456789ABCDEF";
   private final static String codehex = "0123456789abcdef";
   private String codes;
   private Integer measure;
   
   /**
    * Creates a hex charset with no line width set, so decoded output contains
    * no line breaks. The canonical name is "HEX" or "hex" depending on
    * {@code caps}; "HEX" is registered as an alias in both cases.
    *
    * @param caps true to decode to upper-case digits (A-F), false for lower-case (a-f)
    */
   public HexCharset(boolean caps) {
       super(caps ? "HEX" : "hex", new String[]{"HEX"});
       if (caps) {
           codes = codeHEX;
       } else {
           codes = codehex;
       }
   }
   
   /**
    * Creates a hex charset whose decoder inserts a line break once at least
    * {@code measure} characters have been written to the current line. The
    * canonical name is "HEX:measure" or "hex:measure" depending on {@code caps}.
    *
    * @param caps true to decode to upper-case digits (A-F), false for lower-case (a-f)
    * @param measure line width for decoding, in characters; must be positive
    * @throws IllegalArgumentException if {@code measure} is zero or negative
    */
   public HexCharset(boolean caps, int measure) {
       super((caps ? "HEX" : "hex") + ":" + requirePositiveMeasure(measure), new String[]{"HEX"});
       codes = caps ? codeHEX : codehex;
       this.measure = measure;
   }

   /**
    * Validates the line width before the charset is named after it, so a
    * meaningless width fails here rather than later inside the decoder.
    */
   private static int requirePositiveMeasure(int measure) {
       if (measure <= 0) {
           throw new IllegalArgumentException("measure must be positive: " + measure);
       }
       return measure;
   }
   /**
    * Creates an encoder that turns hex text into the bytes it denotes.
    *
    * @return a new encoder bound to this charset
    */
   @Override
   public CharsetEncoder newEncoder() {
       return new Encoder();
   }
   /**
    * Creates a decoder that renders bytes as hex text.
    *
    * @return a new decoder bound to this charset
    */
   @Override
   public CharsetDecoder newDecoder() {
       return new Decoder();
   }
   /**
    * Tells whether this charset contains the given charset.
    *
    * This implementation reports containment only for other
    * {@code HexCharset} instances, regardless of their letter case or line
    * width; every other charset (and {@code null}) yields {@code false}.
    *
    * @param cs the charset to test
    * @return true if {@code cs} is a {@code HexCharset}, false otherwise
    */
   @Override
   public boolean contains(Charset cs) {
       return cs instanceof HexCharset;
   }
   
   private class Encoder extends CharsetEncoder {
       private boolean unpaired;
       private int nyble;
       private Encoder() {
           super(HexCharset.this, 0.49f, 1f);
           
       }
       /**
        * Flushes this encoder.
        * 
        *  The default implementation of this method does nothing, and always
        * returns {@link CoderResult#UNDERFLOW}.  This method should be overridden
        * by encoders that may need to write final bytes to the output buffer
        * once the entire input sequence has been read. 
        * 
        * @param  out
        *         The output byte buffer
        * 
        * @return  A coder-result object, either {@link CoderResult#UNDERFLOW} or
        *          {@link CoderResult#OVERFLOW}
        */
       protected java.nio.charset.CoderResult implFlush(java.nio.ByteBuffer out) {
           if(!unpaired) {
               implReset();
               return CoderResult.UNDERFLOW;
           }
           else
               throw new IllegalArgumentException("Hex string must be an even number of digits");
       }
       /**
        * Encodes one or more characters into one or more bytes.
        * 
        *  This method encapsulates the basic encoding loop, encoding as many
        * characters as possible until it either runs out of input, runs out of room
        * in the output buffer, or encounters an encoding error.  This method is
        * invoked by the {@link #encode encode} method, which handles result
        * interpretation and error recovery.
        * 
        *  The buffers are read from, and written to, starting at their current
        * positions.  At most {@link Buffer#remaining in.remaining()} characters
        * will be read, and at most {@link Buffer#remaining out.remaining()}
        * bytes will be written.  The buffers" positions will be advanced to
        * reflect the characters read and the bytes written, but their marks and
        * limits will not be modified.
        * 
        *  This method returns a {@link CoderResult} object to describe its
        * reason for termination, in the same manner as the {@link #encode encode}
        * method.  Most implementations of this method will handle encoding errors
        * by returning an appropriate result object for interpretation by the
        * {@link #encode encode} method.  An optimized implementation may instead
        * examine the relevant error action and implement that action itself.
        * 
        *  An implementation of this method may perform arbitrary lookahead by
        * returning {@link CoderResult#UNDERFLOW} until it receives sufficient
        * input.  
        * 
        * @param  in
        *         The input character buffer
        * 
        * @param  out
        *         The output byte buffer
        * 
        * @return  A coder-result object describing the reason for termination
        */
       public java.nio.charset.CoderResult encodeLoop(java.nio.CharBuffer in, java.nio.ByteBuffer out) {
           while(in.remaining() > 0) {
               if(out.remaining() <= 0)
                   return CoderResult.OVERFLOW;
               char inch = in.get();
               if(!Character.isWhitespace(inch)) {
                   int d = Character.digit(inch, 16);
                   if(d < 0)
                       throw new IllegalArgumentException("Bad hex character " + inch);
                   if(unpaired)
                       out.put((byte)(nyble | d));
                   else
                       nyble = d << 4;
                   unpaired = !unpaired;
               }
           }
           return CoderResult.UNDERFLOW;
       }
       
       /**
        * Clear state
        */
       protected void implReset() {
           unpaired = false;
           nyble = 0;
       }
       
   }
   
   private class Decoder extends CharsetDecoder {
       private int charCount;
       
       private Decoder() {
           super(HexCharset.this, 2f, measure == null ? 2f : 2f + (2f / (float)measure));
       }
       /**
        * Decodes one or more bytes into one or more characters.
        * 
        *  This method encapsulates the basic decoding loop, decoding as many
        * bytes as possible until it either runs out of input, runs out of room
        * in the output buffer, or encounters a decoding error.  This method is
        * invoked by the {@link #decode decode} method, which handles result
        * interpretation and error recovery.
        * 
        *  The buffers are read from, and written to, starting at their current
        * positions.  At most {@link Buffer#remaining in.remaining()} bytes
        * will be read, and at most {@link Buffer#remaining out.remaining()}
        * characters will be written.  The buffers" positions will be advanced to
        * reflect the bytes read and the characters written, but their marks and
        * limits will not be modified.
        * 
        *  This method returns a {@link CoderResult} object to describe its
        * reason for termination, in the same manner as the {@link #decode decode}
        * method.  Most implementations of this method will handle decoding errors
        * by returning an appropriate result object for interpretation by the
        * {@link #decode decode} method.  An optimized implementation may instead
        * examine the relevant error action and implement that action itself.
        * 
        *  An implementation of this method may perform arbitrary lookahead by
        * returning {@link CoderResult#UNDERFLOW} until it receives sufficient
        * input.  
        * 
        * @param  in
        *         The input byte buffer
        * 
        * @param  out
        *         The output character buffer
        * 
        * @return  A coder-result object describing the reason for termination
        */
       public java.nio.charset.CoderResult decodeLoop(java.nio.ByteBuffer in, java.nio.CharBuffer out) {
           while(in.remaining() > 0) {
               if(measure != null && charCount >= measure) {
                   if(out.remaining() == 0)
                       return CoderResult.OVERFLOW;
                   out.put("\n");
                   charCount = 0;
               }
               if(out.remaining() < 2)
                   return CoderResult.OVERFLOW;
               int b = in.get() & 0xff;
               out.put(codes.charAt(b >>> 4));
               out.put(codes.charAt(b & 0x0f));
               charCount += 2;
           }
           return CoderResult.UNDERFLOW;
       }
       /**
        * Resets this decoder, clearing any charset-specific internal state.
        * 
        *  The default implementation of this method does nothing.  This method
        * should be overridden by decoders that maintain internal state.  
        */
       protected void implReset() {
           charCount = 0;
       }
       
   }

}</source>





List Charsets

   <source lang="java">

import java.nio.charset.Charset; import java.util.Iterator; import java.util.Set; import java.util.SortedMap; public class Main {

 /**
  * Prints every charset available in this JVM, in order of canonical name,
  * each followed by its aliases indented by four spaces.
  *
  * @param args ignored
  */
 static public void main(String args[]) throws Exception {
   SortedMap<String, Charset> charsets = Charset.availableCharsets();
   // values() follows the map's key order, so the listing stays sorted by name.
   for (Charset charset : charsets.values()) {
     System.out.println(charset);
     for (String alias : charset.aliases()) {
       System.out.println("    " + alias);
     }
   }
 }

}</source>





Listing All Available Unicode to Character Set Converters

   <source lang="java">

import java.nio.charset.Charset; import java.util.Iterator; import java.util.Map; public class Main {

 /**
  * Walks the names of all available charsets and looks each one up by name.
  *
  * @param argv ignored
  */
 public static void main(String[] argv) throws Exception {
   Map<String, Charset> map = Charset.availableCharsets();
   for (String charsetName : map.keySet()) {
     // Resolve the charset from its canonical name.
     Charset charset = Charset.forName(charsetName);
   }
 }

}</source>





Translate Charset

   <source lang="java">

import java.io.File; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; public class Main {

 /**
  * Transcodes the file "inFilename" from UTF-8 to UTF-16, writing the result
  * to "outFilename". The input is memory-mapped, decoded to characters and
  * re-encoded in one pass.
  *
  * @param args ignored
  * @throws Exception if a file cannot be opened, mapped or written, or if the
  *         input is not valid UTF-8
  */
 static public void main(String args[]) throws Exception {
   File infile = new File("inFilename");
   File outfile = new File("outFilename");
   RandomAccessFile inraf = new RandomAccessFile(infile, "r");
   try {
     RandomAccessFile outraf = new RandomAccessFile(outfile, "rw");
     try {
       FileChannel finc = inraf.getChannel();
       FileChannel foutc = outraf.getChannel();
       // Pass the size as a long: an int cast would silently truncate large
       // files, whereas map() itself rejects sizes it cannot handle.
       MappedByteBuffer inmbb = finc.map(FileChannel.MapMode.READ_ONLY, 0, finc.size());
       Charset inCharset = Charset.forName("UTF8");
       Charset outCharset = Charset.forName("UTF16");
       CharsetDecoder inDecoder = inCharset.newDecoder();
       CharsetEncoder outEncoder = outCharset.newEncoder();
       CharBuffer cb = inDecoder.decode(inmbb);
       ByteBuffer outbb = outEncoder.encode(cb);
       // A single write may transfer only part of the buffer, so drain it.
       while (outbb.hasRemaining()) {
         foutc.write(outbb);
       }
     } finally {
       outraf.close();
     }
   } finally {
     // Closing the files also closes their channels, even when a step above fails.
     inraf.close();
   }
 }

}</source>