Java Tutorial/I18N/Charset

Материал из Java эксперт
Версия от 05:03, 1 июня 2010; Admin (обсуждение | вклад) (1 версия)
(разн.) ← Предыдущая | Текущая версия (разн.) | Следующая → (разн.)
Перейти к: навигация, поиск

Converting Between Strings (Unicode) and Other Character Set Encodings

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
public class Main {
  /**
   * Encodes a fixed string to ISO-8859-1 bytes and decodes those bytes back
   * into a string, using a dedicated encoder and decoder.
   *
   * @param argv ignored
   * @throws Exception if the text cannot be encoded or the bytes cannot be decoded
   */
  public static void main(String[] argv) throws Exception {
    Charset latin1 = Charset.forName("ISO-8859-1");

    // Characters -> bytes.
    CharsetEncoder toBytes = latin1.newEncoder();
    CharBuffer source = CharBuffer.wrap("a string");
    ByteBuffer encoded = toBytes.encode(source);

    // Bytes -> characters.
    CharsetDecoder toChars = latin1.newDecoder();
    CharBuffer decoded = toChars.decode(encoded);
    String roundTripped = decoded.toString();
  }
}





Detect non-ASCII characters in string

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Arrays;
public class Main {
  /**
   * Decodes an ASCII-only byte sequence and a non-ASCII byte sequence with a
   * strict US-ASCII decoder and prints the outcome of each attempt.
   *
   * @param args ignored
   * @throws Exception if the UTF-8 or US-ASCII charset is unavailable
   */
  public static void main(String[] args) throws Exception {
    // Encode with an explicit charset so the bytes do not depend on the
    // platform default. The first literal is U+FFFD followed by a space,
    // written as an escape so it survives any source-file encoding.
    byte[] invalidBytes = "\uFFFD ".getBytes("UTF-8");
    byte[] validBytes = "(c)".getBytes("UTF-8");
    CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
    report(decoder, validBytes);
    report(decoder, invalidBytes);
  }

  /**
   * Prints the decoded characters of {@code bytes}, or a diagnostic line when
   * the decoder rejects them.
   *
   * @param decoder decoder used to interpret the bytes
   * @param bytes   bytes to check
   */
  private static void report(CharsetDecoder decoder, byte[] bytes) {
    try {
      CharBuffer buffer = decoder.decode(ByteBuffer.wrap(bytes));
      // Copy only the decoded characters instead of exposing the backing array.
      System.out.println(Arrays.toString(buffer.toString().toCharArray()));
    } catch (java.nio.charset.CharacterCodingException e) {
      System.out.println("Non-ASCII input detected: " + e);
    }
  }
}





Encoding and decoding with caller-supplied buffers

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
public class Main {
  /** Capacity of the supplied byte buffer (in bytes) and char buffer (in chars). */
  private static final int BUFFER_SIZE = 1024;

  /**
   * Runs one encode/decode round trip through caller-supplied buffers.
   *
   * @param argv ignored
   * @throws Exception if the sample text cannot be encoded or decoded
   */
  public static void main(String[] argv) throws Exception {
    roundTrip("a string");
  }

  /**
   * Encodes {@code text} as ISO-8859-1 into a supplied direct byte buffer and
   * decodes it back through a supplied char buffer.
   *
   * @param text text to convert; must fit in {@value #BUFFER_SIZE} characters
   * @return the text obtained after decoding the encoded bytes
   * @throws java.nio.charset.CharacterCodingException if a character cannot be
   *         encoded or the bytes cannot be decoded
   * @throws java.nio.BufferOverflowException if the text does not fit in the buffers
   */
  static String roundTrip(String text) throws java.nio.charset.CharacterCodingException {
    Charset charset = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = charset.newDecoder();
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer bbuf = ByteBuffer.allocateDirect(BUFFER_SIZE);
    CharBuffer cbuf = CharBuffer.allocate(BUFFER_SIZE);

    // Load the input and flip so the encoder reads exactly what was written.
    cbuf.put(text);
    cbuf.flip();

    // All input is present, so signal end-of-input and flush the encoder.
    requireComplete(encoder.encode(cbuf, bbuf, true));
    requireComplete(encoder.flush(bbuf));
    bbuf.flip();

    // Reuse the char buffer for output: clear it so the decoder has room.
    cbuf.clear();
    requireComplete(decoder.decode(bbuf, cbuf, true));
    requireComplete(decoder.flush(cbuf));
    cbuf.flip();
    return cbuf.toString();
  }

  /**
   * Turns any coder result other than underflow into the matching exception.
   *
   * @param result result returned by an encode, decode or flush call
   * @throws java.nio.charset.CharacterCodingException if the result reports a
   *         malformed-input or unmappable-character error
   */
  private static void requireComplete(java.nio.charset.CoderResult result)
      throws java.nio.charset.CharacterCodingException {
    if (!result.isUnderflow()) {
      result.throwException();
    }
  }
}





Extending Charset to create a hex charset

/*
 * HexCharset.java
 *
 * Created on 22 December 2005, 21:56
 *
 * To change this template, choose Tools | Options and locate the template under
 * the Source Creation and Management node. Right-click the template and choose
 * Open. You can then make changes to the template in the Source Editor.
 */

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
 * Codec to translate between hex coding and byte string.
 * Hex output is capital if the char set name is given in capitals.
 * hex:nn used as a charset name inserts \n after every nnth character.
 * @author malcolmm
 */
public class HexCharset extends Charset {
    private final static String codeHEX = "0123456789ABCDEF";
    private final static String codehex = "0123456789abcdef";
    private String codes;
    private Integer measure;
    
    /** Creates a new instance of HexCharset 
     * @param caps true for A-F, false for a-f
     */
    public HexCharset(boolean caps) {
        super(caps ? "HEX" : "hex", new String[]{"HEX"});
        codes = caps ? codeHEX : codehex;
    }
    
    /**
     * Construct the charset
     * @param caps true for A-F, false for a-f
     * @param measure Line width for decoding
     */
    public HexCharset(boolean caps, int measure) {
        super((caps ? "HEX" : "hex") + ":" + measure, new String[]{"HEX"});
        codes = caps ? codeHEX : codehex;
        this.measure = measure;
    }
    /**
     * Constructs a new encoder for this charset. 
     * 
     * @return  A new encoder for this charset
     */
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }
    /**
     * Constructs a new decoder for this charset. 
     * 
     * @return  A new decoder for this charset
     */
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }
    /**
     * Tells whether or not this charset contains the given charset.
     * 
     *  A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
     * and only if, every character representable in <i>D</i> is also
     * representable in <i>C</i>.  If this relationship holds then it is
     * guaranteed that every string that can be encoded in <i>D</i> can also be
     * encoded in <i>C</i> without performing any replacements.
     * 
     *  That <i>C</i> contains <i>D</i> does not imply that each character
     * representable in <i>C</i> by a particular byte sequence is represented
     * in <i>D</i> by the same byte sequence, although sometimes this is the
     * case.
     * 
     *  Every charset contains itself.
     * 
     *  This method computes an approximation of the containment relation:
     * If it returns <tt>true</tt> then the given charset is known to be
     * contained by this charset; if it returns <tt>false</tt>, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     * 
     * @return  <tt>true</tt> if, and only if, the given charset
     *          is contained in this charset
     */
    public boolean contains(Charset cs) {
        return cs instanceof HexCharset;
    }
    
    private class Encoder extends CharsetEncoder {
        private boolean unpaired;
        private int nyble;
        private Encoder() {
            super(HexCharset.this, 0.49f, 1f);
            
        }
        /**
         * Flushes this encoder.
         * 
         *  The default implementation of this method does nothing, and always
         * returns {@link CoderResult#UNDERFLOW}.  This method should be overridden
         * by encoders that may need to write final bytes to the output buffer
         * once the entire input sequence has been read. 
         * 
         * @param  out
         *         The output byte buffer
         * 
         * @return  A coder-result object, either {@link CoderResult#UNDERFLOW} or
         *          {@link CoderResult#OVERFLOW}
         */
        protected java.nio.charset.CoderResult implFlush(java.nio.ByteBuffer out) {
            if(!unpaired) {
                implReset();
                return CoderResult.UNDERFLOW;
            }
            else
                throw new IllegalArgumentException("Hex string must be an even number of digits");
        }
        /**
         * Encodes one or more characters into one or more bytes.
         * 
         *  This method encapsulates the basic encoding loop, encoding as many
         * characters as possible until it either runs out of input, runs out of room
         * in the output buffer, or encounters an encoding error.  This method is
         * invoked by the {@link #encode encode} method, which handles result
         * interpretation and error recovery.
         * 
         *  The buffers are read from, and written to, starting at their current
         * positions.  At most {@link Buffer#remaining in.remaining()} characters
         * will be read, and at most {@link Buffer#remaining out.remaining()}
         * bytes will be written.  The buffers" positions will be advanced to
         * reflect the characters read and the bytes written, but their marks and
         * limits will not be modified.
         * 
         *  This method returns a {@link CoderResult} object to describe its
         * reason for termination, in the same manner as the {@link #encode encode}
         * method.  Most implementations of this method will handle encoding errors
         * by returning an appropriate result object for interpretation by the
         * {@link #encode encode} method.  An optimized implementation may instead
         * examine the relevant error action and implement that action itself.
         * 
         *  An implementation of this method may perform arbitrary lookahead by
         * returning {@link CoderResult#UNDERFLOW} until it receives sufficient
         * input.  
         * 
         * @param  in
         *         The input character buffer
         * 
         * @param  out
         *         The output byte buffer
         * 
         * @return  A coder-result object describing the reason for termination
         */
        public java.nio.charset.CoderResult encodeLoop(java.nio.CharBuffer in, java.nio.ByteBuffer out) {
            while(in.remaining() > 0) {
                if(out.remaining() <= 0)
                    return CoderResult.OVERFLOW;
                char inch = in.get();
                if(!Character.isWhitespace(inch)) {
                    int d = Character.digit(inch, 16);
                    if(d < 0)
                        throw new IllegalArgumentException("Bad hex character " + inch);
                    if(unpaired)
                        out.put((byte)(nyble | d));
                    else
                        nyble = d << 4;
                    unpaired = !unpaired;
                }
            }
            return CoderResult.UNDERFLOW;
        }
        
        /**
         * Clear state
         */
        protected void implReset() {
            unpaired = false;
            nyble = 0;
        }
        
    }
    
    private class Decoder extends CharsetDecoder {
        private int charCount;
        
        private Decoder() {
            super(HexCharset.this, 2f, measure == null ? 2f : 2f + (2f / (float)measure));
        }
        /**
         * Decodes one or more bytes into one or more characters.
         * 
         *  This method encapsulates the basic decoding loop, decoding as many
         * bytes as possible until it either runs out of input, runs out of room
         * in the output buffer, or encounters a decoding error.  This method is
         * invoked by the {@link #decode decode} method, which handles result
         * interpretation and error recovery.
         * 
         *  The buffers are read from, and written to, starting at their current
         * positions.  At most {@link Buffer#remaining in.remaining()} bytes
         * will be read, and at most {@link Buffer#remaining out.remaining()}
         * characters will be written.  The buffers" positions will be advanced to
         * reflect the bytes read and the characters written, but their marks and
         * limits will not be modified.
         * 
         *  This method returns a {@link CoderResult} object to describe its
         * reason for termination, in the same manner as the {@link #decode decode}
         * method.  Most implementations of this method will handle decoding errors
         * by returning an appropriate result object for interpretation by the
         * {@link #decode decode} method.  An optimized implementation may instead
         * examine the relevant error action and implement that action itself.
         * 
         *  An implementation of this method may perform arbitrary lookahead by
         * returning {@link CoderResult#UNDERFLOW} until it receives sufficient
         * input.  
         * 
         * @param  in
         *         The input byte buffer
         * 
         * @param  out
         *         The output character buffer
         * 
         * @return  A coder-result object describing the reason for termination
         */
        public java.nio.charset.CoderResult decodeLoop(java.nio.ByteBuffer in, java.nio.CharBuffer out) {
            while(in.remaining() > 0) {
                if(measure != null && charCount >= measure) {
                    if(out.remaining() == 0)
                        return CoderResult.OVERFLOW;
                    out.put("\n");
                    charCount = 0;
                }
                if(out.remaining() < 2)
                    return CoderResult.OVERFLOW;
                int b = in.get() & 0xff;
                out.put(codes.charAt(b >>> 4));
                out.put(codes.charAt(b & 0x0f));
                charCount += 2;
            }
            return CoderResult.UNDERFLOW;
        }
        /**
         * Resets this decoder, clearing any charset-specific internal state.
         * 
         *  The default implementation of this method does nothing.  This method
         * should be overridden by decoders that maintain internal state.  
         */
        protected void implReset() {
            charCount = 0;
        }
        
    }
}





List Charsets

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import java.util.SortedMap;
public class Main {
  /**
   * Prints every charset available in this JVM, in the order of the map
   * returned by {@link Charset#availableCharsets()}, each followed by its
   * aliases indented by four spaces.
   *
   * @param args ignored
   * @throws Exception declared for parity with the other examples
   */
  static public void main(String args[]) throws Exception {
    SortedMap<String, Charset> charsets = Charset.availableCharsets();
    // values() iterates in key order, matching a lookup per sorted key.
    for (Charset charset : charsets.values()) {
      System.out.println(charset);
      for (String alias : charset.aliases()) {
        System.out.println("    " + alias);
      }
    }
  }
}





Listing All Available Unicode to Character Set Converters

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
public class Main {
  /**
   * Walks the names of all charsets available in this JVM and looks each one
   * up by name.
   *
   * @param argv ignored
   * @throws Exception if a listed charset name cannot be resolved
   */
  public static void main(String[] argv) throws Exception {
    Map<String, Charset> map = Charset.availableCharsets();
    for (String charsetName : map.keySet()) {
      // Resolve the charset by name; a real program would use it here.
      Charset charset = Charset.forName(charsetName);
    }
  }
}





Translate Charset

import java.io.File;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
public class Main {
  /**
   * Reads the file {@code inFilename} as UTF-8 and writes its text to
   * {@code outFilename} as UTF-16, replacing any previous content of the
   * output file.
   *
   * @param args ignored
   * @throws Exception if a file cannot be opened, read or written, or if the
   *         input is not valid UTF-8
   */
  static public void main(String args[]) throws Exception {
    File infile = new File("inFilename");
    File outfile = new File("outFilename");
    RandomAccessFile inraf = new RandomAccessFile(infile, "r");
    try {
      RandomAccessFile outraf = new RandomAccessFile(outfile, "rw");
      try {
        FileChannel finc = inraf.getChannel();
        FileChannel foutc = outraf.getChannel();
        // Use the channel's own size so the length is not narrowed to int.
        MappedByteBuffer inmbb = finc.map(FileChannel.MapMode.READ_ONLY, 0, finc.size());
        Charset inCharset = Charset.forName("UTF8");
        Charset outCharset = Charset.forName("UTF16");
        CharsetDecoder inDecoder = inCharset.newDecoder();
        CharsetEncoder outEncoder = outCharset.newEncoder();
        CharBuffer cb = inDecoder.decode(inmbb);
        ByteBuffer outbb = outEncoder.encode(cb);
        // Drop stale content only after conversion succeeded, so a longer
        // pre-existing file does not leave trailing bytes behind.
        foutc.truncate(0);
        // A single write may be partial; loop until the buffer is drained.
        while (outbb.hasRemaining()) {
          foutc.write(outbb);
        }
      } finally {
        outraf.close();
      }
    } finally {
      inraf.close();
    }
  }
}