Java Tutorial/I18N/Charset
Содержание
Converting Between Strings (Unicode) and Other Character Set Encodings
<source lang="java">
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

/**
 * Round-trips a string through ISO-8859-1 using explicit encoder and decoder
 * objects obtained from the charset.
 */
public class Main {
  public static void main(String[] argv) throws Exception {
    Charset latin1 = Charset.forName("ISO-8859-1");
    CharsetEncoder toBytes = latin1.newEncoder();
    CharsetDecoder toChars = latin1.newDecoder();

    // Unicode characters -> ISO-8859-1 bytes.
    CharBuffer source = CharBuffer.wrap("a string");
    ByteBuffer encoded = toBytes.encode(source);

    // ISO-8859-1 bytes -> Unicode characters, materialized as a String.
    CharBuffer decoded = toChars.decode(encoded);
    String s = decoded.toString();
  }
}</source>
Detect non-ASCII characters in string
<source lang="java">
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Arrays;

/**
 * Detects non-ASCII content by running bytes through a US-ASCII decoder.
 * A freshly created decoder reports malformed input, so decoding bytes outside
 * the ASCII range fails with a CharacterCodingException.
 */
public class Main {
  public static void main(String[] args) throws Exception {
    // Encode with an explicit charset: the no-argument getBytes() uses the
    // platform default, and a single-byte default may replace the non-ASCII
    // character with '?', which would hide exactly what this example detects.
    // NOTE(review): the first literal looks like a mis-encoded character from
    // the original page; it is kept unchanged.
    byte[] invalidBytes = "� ".getBytes("UTF-8");
    byte[] validBytes = "(c)".getBytes("UTF-8");

    CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();

    // Pure ASCII input decodes cleanly.
    CharBuffer buffer = decoder.decode(ByteBuffer.wrap(validBytes));
    System.out.println(Arrays.toString(buffer.array()));

    // Non-ASCII bytes make decode() throw; the exception propagates out of
    // main and is the "detection" this example demonstrates.
    buffer = decoder.decode(ByteBuffer.wrap(invalidBytes));
    System.out.println(Arrays.toString(buffer.array()));
  }
}</source>
Encoding and decoding with caller-supplied buffers
<source lang="java">
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
 * Encodes and decodes through buffers supplied by the caller instead of the
 * buffers the convenience encode/decode methods allocate.
 */
public class Main {
  public static void main(String[] argv) throws Exception {
    Charset charset = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = charset.newDecoder();
    CharsetEncoder encoder = charset.newEncoder();

    ByteBuffer bbuf = ByteBuffer.allocateDirect(1024);
    CharBuffer cbuf = CharBuffer.allocate(1024);

    // Give the encoder real text to read; a freshly allocated buffer would be
    // encoded as 1024 NUL characters.
    cbuf.put("a string");
    cbuf.flip();

    // This is the only chunk of input, so signal end-of-input and flush.
    requireComplete(encoder.encode(cbuf, bbuf, true));
    requireComplete(encoder.flush(bbuf));
    bbuf.flip();

    // The character buffer was fully consumed by the encoder; clear it so the
    // decoder has room to write. Without this decode() returns OVERFLOW and
    // produces nothing.
    cbuf.clear();
    requireComplete(decoder.decode(bbuf, cbuf, true));
    requireComplete(decoder.flush(cbuf));
    cbuf.flip();
  }

  /** Turns any coder result other than UNDERFLOW into the matching exception. */
  private static void requireComplete(CoderResult result) throws CharacterCodingException {
    if (!result.isUnderflow()) {
      result.throwException();
    }
  }
}</source>
Extending Charset to create a hex charset
<source lang="java">
/*
* HexCharset.java * * Created on 22 December 2005, 21:56 * * To change this template, choose Tools | Options and locate the template under * the Source Creation and Management node. Right-click the template and choose * Open. You can then make changes to the template in the Source Editor. */
import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; /**
* Codec to translate between hex coding and byte string. * Hex output is capital if the char set name is given in capitals. * hex:nn used as a charset name inserts \n after every nnth character. * @author malcolmm */
public class HexCharset extends Charset {
  private final static String codeHEX = "0123456789ABCDEF";
  private final static String codehex = "0123456789abcdef";

  /** Digit alphabet used when turning bytes into hex text (upper or lower case). */
  private final String codes;

  /** Line width in characters used when decoding, or null for no line breaks. */
  private final Integer measure;

  /**
   * Creates a charset whose decoder emits hex text without line breaks.
   *
   * @param caps true for A-F, false for a-f
   */
  public HexCharset(boolean caps) {
    super(caps ? "HEX" : "hex", new String[]{"HEX"});
    codes = caps ? codeHEX : codehex;
    measure = null;
  }

  /**
   * Creates a charset whose decoder starts a new line once at least
   * {@code measure} characters have been written on the current line.
   *
   * @param caps true for A-F, false for a-f
   * @param measure Line width for decoding; must be at least 1
   * @throws IllegalArgumentException if {@code measure} is not positive
   */
  public HexCharset(boolean caps, int measure) {
    super((caps ? "HEX" : "hex") + ":" + requirePositive(measure), new String[]{"HEX"});
    codes = caps ? codeHEX : codehex;
    this.measure = Integer.valueOf(measure);
  }

  /**
   * Rejects line widths the decoder cannot work with. The decoder declares
   * 2 + 2/measure as its maximum characters per byte; that value is infinite
   * for 0 and below the declared average of 2 for negative widths, which
   * previously surfaced only later, when a decoder or encoder was created.
   */
  private static int requirePositive(int measure) {
    if (measure < 1) {
      throw new IllegalArgumentException("Line width must be positive: " + measure);
    }
    return measure;
  }

  /**
   * Constructs a new encoder (hex text to bytes) for this charset.
   *
   * @return A new encoder for this charset
   */
  @Override
  public CharsetEncoder newEncoder() {
    return new Encoder();
  }

  /**
   * Constructs a new decoder (bytes to hex text) for this charset.
   *
   * @return A new decoder for this charset
   */
  @Override
  public CharsetDecoder newDecoder() {
    return new Decoder();
  }

  /**
   * Tells whether this charset contains the given charset.
   *
   * @return true if, and only if, the given charset is a HexCharset
   */
  @Override
  public boolean contains(Charset cs) {
    return cs instanceof HexCharset;
  }

  /** Turns pairs of hex digits into bytes, ignoring whitespace between digits. */
  private class Encoder extends CharsetEncoder {
    /** True while a high nibble is waiting for its low nibble. */
    private boolean unpaired;
    /** The pending high nibble, already shifted into the upper four bits. */
    private int nyble;

    private Encoder() {
      super(HexCharset.this, 0.49f, 1f);
    }

    /**
     * Finishes encoding. There is never anything left to write, but input
     * that ended on a dangling high nibble is rejected.
     *
     * @param out The output byte buffer (unused)
     * @return {@link CoderResult#UNDERFLOW}
     * @throws IllegalArgumentException if an odd number of digits was supplied
     */
    @Override
    protected java.nio.charset.CoderResult implFlush(java.nio.ByteBuffer out) {
      if (unpaired) {
        throw new IllegalArgumentException("Hex string must be an even number of digits");
      }
      implReset();
      return CoderResult.UNDERFLOW;
    }

    /**
     * Consumes hex digits from {@code in} and writes one byte to {@code out}
     * for every completed pair. Whitespace is skipped.
     *
     * @param in The input character buffer
     * @param out The output byte buffer
     * @return {@link CoderResult#OVERFLOW} if a byte is ready but {@code out}
     *         is full, otherwise {@link CoderResult#UNDERFLOW} once all input
     *         has been consumed
     * @throws IllegalArgumentException if a non-whitespace character is not a
     *         hex digit
     */
    @Override
    public java.nio.charset.CoderResult encodeLoop(java.nio.CharBuffer in, java.nio.ByteBuffer out) {
      while (in.hasRemaining()) {
        char inch = in.get();
        if (Character.isWhitespace(inch)) {
          continue;
        }
        int d = Character.digit(inch, 16);
        if (d < 0) {
          throw new IllegalArgumentException("Bad hex character " + inch);
        }
        if (unpaired) {
          // Room is only needed when a byte is actually produced. Checking
          // earlier reported OVERFLOW for trailing whitespace or a lone high
          // nibble even though nothing had to be written.
          if (!out.hasRemaining()) {
            // Un-read the digit so it is seen again once the caller has
            // made room in the output buffer.
            in.position(in.position() - 1);
            return CoderResult.OVERFLOW;
          }
          out.put((byte) (nyble | d));
        } else {
          nyble = d << 4;
        }
        unpaired = !unpaired;
      }
      return CoderResult.UNDERFLOW;
    }

    /** Clears the pending-nibble state. */
    @Override
    protected void implReset() {
      unpaired = false;
      nyble = 0;
    }
  }

  /** Turns every byte into two hex digits, optionally inserting line breaks. */
  private class Decoder extends CharsetDecoder {
    /** Characters written since the last line break. */
    private int charCount;

    private Decoder() {
      // Two characters per byte, plus the share of a newline when a line
      // width is configured.
      super(HexCharset.this, 2f, measure == null ? 2f : 2f + (2f / (float) measure));
    }

    /**
     * Writes two hex digits per input byte. When a line width is set, a
     * newline is written before the next byte once at least that many
     * characters have been written on the current line.
     *
     * @param in The input byte buffer
     * @param out The output character buffer
     * @return {@link CoderResult#OVERFLOW} if {@code out} lacks room for the
     *         next newline or digit pair, otherwise
     *         {@link CoderResult#UNDERFLOW} once all input has been consumed
     */
    @Override
    public java.nio.charset.CoderResult decodeLoop(java.nio.ByteBuffer in, java.nio.CharBuffer out) {
      while (in.hasRemaining()) {
        if (measure != null && charCount >= measure) {
          if (!out.hasRemaining()) {
            return CoderResult.OVERFLOW;
          }
          out.put("\n");
          charCount = 0;
        }
        // Checked before reading so no byte is consumed without being written.
        if (out.remaining() < 2) {
          return CoderResult.OVERFLOW;
        }
        int b = in.get() & 0xff;
        out.put(codes.charAt(b >>> 4));
        out.put(codes.charAt(b & 0x0f));
        charCount += 2;
      }
      return CoderResult.UNDERFLOW;
    }

    /** Restarts line-width counting. */
    @Override
    protected void implReset() {
      charCount = 0;
    }
  }
}</source>
List Charsets
<source lang="java">
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import java.util.SortedMap;

/**
 * Prints every charset available in this JVM, each followed by its aliases
 * (one per line, indented by a single space).
 */
public class Main {
  static public void main(String args[]) throws Exception {
    SortedMap<String, Charset> charsets = Charset.availableCharsets();
    // values() of a sorted map iterates in key order, i.e. the same order as
    // walking the key set and looking each charset up by name.
    for (Charset charset : charsets.values()) {
      System.out.println(charset);
      for (String alias : charset.aliases()) {
        System.out.println(" " + alias);
      }
    }
  }
}</source>
Listing All Available Unicode to Character Set Converters
<source lang="java">
import java.nio.charset.Charset; import java.util.Iterator; import java.util.Map; public class Main {
public static void main(String[] argv) throws Exception { Map map = Charset.availableCharsets(); Iterator it = map.keySet().iterator(); while (it.hasNext()) { // Get charset name String charsetName = (String) it.next(); // Get charset Charset charset = Charset.forName(charsetName); } }
}</source>
Translate Charset
<source lang="java">
import java.io.File;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

/**
 * Re-encodes the file "inFilename" from UTF-8 to UTF-16 and stores the result
 * in "outFilename", replacing whatever that file contained before.
 */
public class Main {
  static public void main(String args[]) throws Exception {
    File infile = new File("inFilename");
    File outfile = new File("outFilename");

    RandomAccessFile inraf = new RandomAccessFile(infile, "r");
    try {
      RandomAccessFile outraf = new RandomAccessFile(outfile, "rw");
      try {
        FileChannel finc = inraf.getChannel();
        FileChannel foutc = outraf.getChannel();

        // Map the whole input. The size is passed as a long so an oversized
        // file is rejected by map() instead of being silently cut down by an
        // int cast.
        MappedByteBuffer inmbb = finc.map(FileChannel.MapMode.READ_ONLY, 0, finc.size());

        Charset inCharset = Charset.forName("UTF8");
        Charset outCharset = Charset.forName("UTF16");
        CharsetDecoder inDecoder = inCharset.newDecoder();
        CharsetEncoder outEncoder = outCharset.newEncoder();

        CharBuffer cb = inDecoder.decode(inmbb);
        ByteBuffer outbb = outEncoder.encode(cb);

        // A single write() is allowed to be partial; keep going until the
        // buffer is drained.
        while (outbb.hasRemaining()) {
          foutc.write(outbb);
        }
        // "rw" mode does not truncate: drop any bytes left over from a longer
        // pre-existing output file.
        foutc.truncate(foutc.position());
      } finally {
        outraf.close();
      }
    } finally {
      // Closing in finally releases both files even when decoding, encoding
      // or writing fails.
      inraf.close();
    }
  }
}</source>