Java Tutorial/I18N/Charset

Содержание

1 Converting Between Strings (Unicode) and Other Character Set Encodings
2 Detect non-ASCII characters in string
3 encoder and decoder use a supplied ByteBuffer
4 extends Charset to create Hex Charset
5 List Charsets
6 Listing All Available Unicode to Character Set Converters
7 Translate Charset

Converting Between Strings (Unicode) and Other Character Set Encodings

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Round-trips a string through ISO-8859-1: characters are encoded to bytes
 * and the bytes are decoded back to characters.
 */
public class Main {
  /**
   * Encodes the text {@code "a string"} with an ISO-8859-1 encoder and decodes
   * the resulting bytes again with the matching decoder.
   *
   * @param argv ignored
   * @throws Exception if the charset cannot be loaded or the text cannot be
   *         encoded or decoded
   */
  public static void main(String[] argv) throws Exception {
    Charset latin1 = Charset.forName("ISO-8859-1");
    CharsetEncoder toBytes = latin1.newEncoder();
    CharsetDecoder toChars = latin1.newDecoder();

    // String -> bytes: the one-shot encode allocates and returns its own buffer.
    CharBuffer source = CharBuffer.wrap("a string");
    ByteBuffer encoded = toBytes.encode(source);

    // bytes -> String: the one-shot decode likewise returns a new buffer.
    String s = toChars.decode(encoded).toString();
  }
}

Detect non-ASCII characters in string

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Arrays;
/**
 * Shows that a strict US-ASCII decoder accepts pure ASCII bytes and rejects
 * bytes outside the 7-bit range.
 */
public class Main {
  /**
   * Decodes one valid and one invalid byte sequence with a US-ASCII decoder.
   * The valid sequence is printed as a character array; decoding the invalid
   * sequence fails, because a new decoder reports malformed input by default.
   *
   * @param args ignored
   * @throws Exception a {@code CharacterCodingException} is raised for the
   *         non-ASCII input and propagates to the caller
   */
  public static void main(String[] args) throws Exception {
    // Encode with an explicit charset: the no-argument getBytes() uses the
    // platform default, which can turn the non-ASCII sample into a plain '?'.
    Charset utf8 = Charset.forName("UTF-8");
    // U+FFFD followed by a space, written as an escape so the sample does not
    // depend on the encoding of this source file. Its UTF-8 form is non-ASCII.
    // NOTE(review): the literal was probably mangled from another character
    // (for example an accented letter) when the page was converted - confirm.
    byte[] invalidBytes = "\uFFFD ".getBytes(utf8);
    byte[] validBytes = "(c)".getBytes(utf8);
    CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();

    CharBuffer buffer = decoder.decode(ByteBuffer.wrap(validBytes));
    // Print only the decoded characters; array() would expose the whole
    // backing array, regardless of the buffer's position and limit.
    System.out.println(Arrays.toString(buffer.toString().toCharArray()));

    buffer = decoder.decode(ByteBuffer.wrap(invalidBytes));
    System.out.println(Arrays.toString(buffer.toString().toCharArray()));
  }
}

Encoder and decoder using a supplied ByteBuffer

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Demonstrates the buffer-to-buffer forms of {@code CharsetEncoder.encode}
 * and {@code CharsetDecoder.decode}, which work on caller-supplied buffers
 * instead of allocating their own.
 */
public class Main {
  /** Capacity, in elements, of the byte and character buffers. */
  private static final int BUFFER_SIZE = 1024;

  /**
   * Round-trips a sample string through ISO-8859-1.
   *
   * @param argv ignored
   * @throws Exception if encoding or decoding fails
   */
  public static void main(String[] argv) throws Exception {
    roundTrip("a string");
  }

  /**
   * Encodes {@code text} into a direct byte buffer as ISO-8859-1 and decodes
   * those bytes back into characters.
   *
   * @param text the characters to round-trip; must fit in the fixed-size
   *        buffers, otherwise a {@code BufferOverflowException} is thrown
   * @return the decoded characters as a string
   * @throws java.nio.charset.CharacterCodingException if a character cannot be
   *         mapped to ISO-8859-1
   */
  static String roundTrip(String text) throws java.nio.charset.CharacterCodingException {
    Charset charset = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = charset.newDecoder();
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer bbuf = ByteBuffer.allocateDirect(BUFFER_SIZE);
    CharBuffer cbuf = CharBuffer.allocate(BUFFER_SIZE);

    // Fill the character buffer, then flip it so the encoder reads only the
    // text just written rather than the unused remainder of the buffer.
    cbuf.put(text);
    cbuf.flip();
    check(encoder.encode(cbuf, bbuf, true));
    check(encoder.flush(bbuf));
    bbuf.flip();

    // Reuse the character buffer as the decoder's output; without clear() it
    // would have no space left and the decoder could not write anything.
    cbuf.clear();
    check(decoder.decode(bbuf, cbuf, true));
    check(decoder.flush(cbuf));
    cbuf.flip();
    return cbuf.toString();
  }

  /** Turns any coder result other than underflow (normal completion) into an exception. */
  private static void check(java.nio.charset.CoderResult result)
      throws java.nio.charset.CharacterCodingException {
    if (!result.isUnderflow()) {
      result.throwException();
    }
  }
}

Extending Charset to create a Hex Charset

/*
 * HexCharset.java
 *
 * Created on 22 December 2005, 21:56
 *
 * To change this template, choose Tools | Options and locate the template under
 * the Source Creation and Management node. Right-click the template and choose
 * Open. You can then make changes to the template in the Source Editor.
 */

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
 * Codec to translate between hex coding and byte string.
 * Hex output is capital if the char set name is given in capitals.
 * hex:nn used as a charset name inserts \n after every nnth character.
 * @author malcolmm
 */
public class HexCharset extends Charset {
    /** Digits used when bytes are decoded to upper-case hex text. */
    private final static String HEX_UPPER = "0123456789ABCDEF";
    /** Digits used when bytes are decoded to lower-case hex text. */
    private final static String HEX_LOWER = "0123456789abcdef";

    /** Digit alphabet selected at construction time. */
    private final String codes;
    /** Number of hex characters after which the decoder breaks the line; null for no breaks. */
    private final Integer measure;

    /**
     * Creates a hex charset whose decoder produces no line breaks.
     *
     * @param caps true for A-F, false for a-f
     */
    public HexCharset(boolean caps) {
        super(caps ? "HEX" : "hex", new String[]{"HEX"});
        codes = caps ? HEX_UPPER : HEX_LOWER;
        measure = null;
    }

    /**
     * Creates a hex charset whose decoder inserts a line feed once at least
     * {@code measure} hex characters have been written to the current line.
     *
     * @param caps true for A-F, false for a-f
     * @param measure line width, in hex characters, for decoding; must be at least 1
     * @throws IllegalArgumentException if {@code measure} is less than 1
     */
    public HexCharset(boolean caps, int measure) {
        super((caps ? "HEX" : "hex") + ":" + requirePositive(measure), new String[]{"HEX"});
        codes = caps ? HEX_UPPER : HEX_LOWER;
        this.measure = measure;
    }

    /**
     * Rejects line widths the decoder cannot work with. The width is the
     * divisor when the decoder computes its maximum characters-per-byte, so
     * zero or negative values yield an infinite or non-positive bound.
     */
    private static int requirePositive(int measure) {
        if (measure < 1) {
            throw new IllegalArgumentException("measure must be at least 1 but was " + measure);
        }
        return measure;
    }

    /**
     * Constructs a new encoder, which turns hex text into the bytes it denotes.
     *
     * @return a new encoder for this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }

    /**
     * Constructs a new decoder, which turns bytes into hex text.
     *
     * @return a new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }

    /**
     * Tells whether this charset contains the given charset. Only other hex
     * charsets are reported as contained.
     *
     * @param cs the charset to test
     * @return true if {@code cs} is a {@code HexCharset}
     */
    @Override
    public boolean contains(Charset cs) {
        return cs instanceof HexCharset;
    }

    /**
     * Encoder: reads hex digits, skipping whitespace, and writes one byte for
     * every pair of digits.
     */
    private class Encoder extends CharsetEncoder {
        /** True while a high nibble is waiting for its low nibble. */
        private boolean unpaired;
        /** The pending high nibble, already shifted into the upper four bits. */
        private int nyble;

        private Encoder() {
            // Two digits make one byte; the average is kept just under 0.5.
            super(HexCharset.this, 0.49f, 1f);
        }

        /**
         * Finishes encoding and clears the encoder state.
         *
         * @param out the output byte buffer (nothing is written to it)
         * @return {@link CoderResult#UNDERFLOW}
         * @throws IllegalArgumentException if the input ended after an odd
         *         number of hex digits
         */
        @Override
        protected CoderResult implFlush(java.nio.ByteBuffer out) {
            if (unpaired) {
                throw new IllegalArgumentException("Hex string must be an even number of digits");
            }
            implReset();
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes hex digits from {@code in} into bytes in {@code out}.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return {@link CoderResult#OVERFLOW} if {@code out} is full while
         *         input remains, otherwise {@link CoderResult#UNDERFLOW}
         * @throws IllegalArgumentException if a character is neither
         *         whitespace nor a hex digit
         */
        @Override
        public CoderResult encodeLoop(java.nio.CharBuffer in, java.nio.ByteBuffer out) {
            while (in.hasRemaining()) {
                // Checked before every character, even ones that emit nothing,
                // so a completed pair always has room for its byte.
                if (!out.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                char inch = in.get();
                if (Character.isWhitespace(inch)) {
                    continue;
                }
                int d = Character.digit(inch, 16);
                if (d < 0) {
                    throw new IllegalArgumentException("Bad hex character " + inch);
                }
                if (unpaired) {
                    out.put((byte) (nyble | d));
                } else {
                    nyble = d << 4;
                }
                unpaired = !unpaired;
            }
            return CoderResult.UNDERFLOW;
        }

        /** Clears the pending-nibble state. */
        @Override
        protected void implReset() {
            unpaired = false;
            nyble = 0;
        }
    }

    /**
     * Decoder: writes two hex digits for every input byte and, when a line
     * width is configured, a line feed between lines.
     */
    private class Decoder extends CharsetDecoder {
        /** Hex characters written since the last line feed. */
        private int charCount;

        private Decoder() {
            // Two characters per byte, plus a share of the line feeds when a
            // line width is configured.
            super(HexCharset.this, 2f, measure == null ? 2f : 2f + (2f / (float) measure));
        }

        /**
         * Decodes bytes from {@code in} into hex characters in {@code out}.
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return {@link CoderResult#OVERFLOW} if {@code out} lacks room for
         *         the next line feed or digit pair, otherwise
         *         {@link CoderResult#UNDERFLOW}
         */
        @Override
        public CoderResult decodeLoop(java.nio.ByteBuffer in, java.nio.CharBuffer out) {
            while (in.hasRemaining()) {
                // The line feed is written before the next byte rather than
                // after the last one, so output never ends with a line feed.
                if (measure != null && charCount >= measure) {
                    if (!out.hasRemaining()) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put("\n");
                    charCount = 0;
                }
                // Both digits of a byte are written together or not at all.
                if (out.remaining() < 2) {
                    return CoderResult.OVERFLOW;
                }
                int b = in.get() & 0xff;
                out.put(codes.charAt(b >>> 4));
                out.put(codes.charAt(b & 0x0f));
                charCount += 2;
            }
            return CoderResult.UNDERFLOW;
        }

        /** Resets the line-position counter. */
        @Override
        protected void implReset() {
            charCount = 0;
        }
    }
}

List Charsets

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import java.util.SortedMap;
/**
 * Prints every charset available in this Java virtual machine, each followed
 * by its aliases.
 */
public class Main {
  /**
   * Writes each available charset on its own line, then its aliases on
   * following lines indented by four spaces. Charsets appear in the order of
   * the sorted map returned by {@code Charset.availableCharsets()}.
   *
   * @param args ignored
   * @throws Exception declared for the demo; nothing here is expected to throw
   */
  static public void main(String args[]) throws Exception {
    // Typed collections replace the raw types and casts of the old loop.
    SortedMap<String, Charset> charsets = Charset.availableCharsets();
    for (Charset charset : charsets.values()) {
      System.out.println(charset);
      for (String alias : charset.aliases()) {
        System.out.println("    " + alias);
      }
    }
  }
}

Listing All Available Unicode to Character Set Converters

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
/**
 * Walks over the names of all available charsets and looks each one up,
 * obtaining the object that converts between Unicode and that character set.
 */
public class Main {
  /**
   * Looks up every available charset by name. Nothing is printed; the loop
   * only shows how the charsets are enumerated and obtained.
   *
   * @param argv ignored
   * @throws Exception declared for the demo; every name comes from the
   *         available-charsets map, so the lookups are not expected to fail
   */
  public static void main(String[] argv) throws Exception {
    // A typed map removes the raw Iterator and the String cast.
    Map<String, Charset> map = Charset.availableCharsets();
    for (String charsetName : map.keySet()) {
      // Get charset
      Charset charset = Charset.forName(charsetName);
    }
  }
}

Translate Charset

import java.io.File;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Converts a text file from UTF-8 to UTF-16 by memory-mapping the input,
 * decoding it to characters and writing the re-encoded bytes.
 */
public class Main {
  /**
   * Translates the input file from UTF-8 to UTF-16.
   *
   * @param args optional file names: {@code args[0]} is the input file
   *        (default {@code "inFilename"}) and {@code args[1]} the output file
   *        (default {@code "outFilename"})
   * @throws Exception if a file cannot be opened, mapped or written, or if
   *         the input is not valid UTF-8
   */
  static public void main(String args[]) throws Exception {
    File infile = new File(args != null && args.length > 0 ? args[0] : "inFilename");
    File outfile = new File(args != null && args.length > 1 ? args[1] : "outFilename");

    // try/finally closes both files even when mapping, decoding or writing fails.
    RandomAccessFile inraf = new RandomAccessFile(infile, "r");
    try {
      RandomAccessFile outraf = new RandomAccessFile(outfile, "rw");
      try {
        FileChannel finc = inraf.getChannel();
        FileChannel foutc = outraf.getChannel();

        // Map using the channel's own size: narrowing the length to int would
        // silently truncate it, whereas map() rejects an oversized request.
        MappedByteBuffer inmbb = finc.map(FileChannel.MapMode.READ_ONLY, 0, finc.size());
        Charset inCharset = Charset.forName("UTF8");
        Charset outCharset = Charset.forName("UTF16");
        CharsetDecoder inDecoder = inCharset.newDecoder();
        CharsetEncoder outEncoder = outCharset.newEncoder();
        CharBuffer cb = inDecoder.decode(inmbb);
        ByteBuffer outbb = outEncoder.encode(cb);

        // "rw" mode keeps existing content, so drop it once the conversion has
        // succeeded; otherwise a longer old file leaves stale bytes at the end.
        foutc.truncate(0);
        // A single write() call is not required to consume the whole buffer.
        while (outbb.hasRemaining()) {
          foutc.write(outbb);
        }
      } finally {
        outraf.close();
      }
    } finally {
      inraf.close();
    }
  }
}

Java Tutorial/I18N/Charset

Содержание

Converting Between Strings (Unicode) and Other Character Set Encodings

Detect non-ASCII characters in string

encoder and decoder use a supplied ByteBuffer

extends Charset to create Hex Charset

List Charsets

Listing All Available Unicode to Character Set Converters

Translate Charset

Навигация

Персональные инструменты

Пространства имён

Варианты

Просмотры

Ещё

Поиск

Разделы

Навигация

Инструменты