Java Tutorial/I18N/Charset
Версия от 17:44, 31 мая 2010; (обсуждение)
Содержание
Converting Between Strings (Unicode) and Other Character Set Encodings
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Round-trips a string through the ISO-8859-1 charset: characters are encoded
 * to bytes and the bytes are decoded back to characters.
 */
public class Main {
  /**
   * Encodes the literal {@code "a string"} with ISO-8859-1 and decodes the
   * resulting bytes back into a {@code String}.
   *
   * @param argv unused
   * @throws Exception if the charset is unavailable or a coding step fails
   */
  public static void main(String[] argv) throws Exception {
    Charset latin1 = Charset.forName("ISO-8859-1");
    CharsetEncoder toBytes = latin1.newEncoder();
    CharsetDecoder toChars = latin1.newDecoder();

    // String -> bytes
    CharBuffer source = CharBuffer.wrap("a string");
    ByteBuffer encoded = toBytes.encode(source);

    // bytes -> String
    String s = toChars.decode(encoded).toString();
  }
}
Detecting non-ASCII characters in a string
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Arrays;
/**
 * Detects non-ASCII content by running bytes through a strict US-ASCII
 * decoder: a freshly created decoder reports malformed input, so decoding
 * fails for any byte outside the 7-bit range.
 */
public class Main {
  /**
   * Decodes one pure-ASCII and one non-ASCII sample and prints, for each,
   * either the decoded characters or a message that non-ASCII input was found.
   *
   * @param args unused
   * @throws Exception if a required charset is unavailable
   */
  public static void main(String[] args) throws Exception {
    // Encode explicitly as UTF-8 so the bytes do not depend on the platform
    // default charset (which could silently turn the character into '?').
    Charset utf8 = Charset.forName("UTF-8");
    // The non-ASCII literal in the original listing was lost to an encoding
    // error; a copyright sign (U+00A9) is used as the counterpart of "(c)".
    byte[] invalidBytes = "\u00A9 ".getBytes(utf8);
    byte[] validBytes = "(c)".getBytes(utf8);

    CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
    report(decoder, validBytes);
    report(decoder, invalidBytes);
  }

  /** Prints the decoded characters of {@code bytes}, or a notice if they are not pure ASCII. */
  private static void report(CharsetDecoder decoder, byte[] bytes) {
    try {
      CharBuffer buffer = decoder.decode(ByteBuffer.wrap(bytes));
      // Print only the decoded characters, not the whole backing array.
      System.out.println(Arrays.toString(buffer.toString().toCharArray()));
    } catch (java.nio.charset.CharacterCodingException e) {
      System.out.println("Non-ASCII input detected: " + e);
    }
  }
}
Using an encoder and decoder with a supplied ByteBuffer
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Shows the low-level {@code encode}/{@code decode} calls that work on
 * caller-supplied buffers instead of allocating new ones.
 */
public class Main {
  /**
   * Round-trips the sample text {@code "a string"} through ISO-8859-1.
   *
   * @param argv unused
   * @throws Exception if the charset is unavailable or a coding step fails
   */
  public static void main(String[] argv) throws Exception {
    String decoded = roundTrip("a string");
  }

  /**
   * Encodes {@code text} into a supplied direct byte buffer and decodes it
   * back into a supplied char buffer, both of capacity 1024.
   *
   * @param text the characters to round-trip; at most 1024 characters
   * @return the text obtained after encoding and decoding
   * @throws java.nio.charset.CharacterCodingException if {@code text} cannot
   *         be represented in ISO-8859-1
   * @throws java.nio.BufferOverflowException if {@code text} does not fit in
   *         the fixed-size buffers
   */
  static String roundTrip(String text) throws java.nio.charset.CharacterCodingException {
    Charset charset = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = charset.newDecoder();
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer bbuf = ByteBuffer.allocateDirect(1024);
    CharBuffer cbuf = CharBuffer.allocate(1024);

    // Fill the char buffer and flip it so the encoder reads exactly the text.
    cbuf.put(text);
    cbuf.flip();

    // endOfInput = true: this is the only chunk, so finish and flush.
    check(encoder.encode(cbuf, bbuf, true));
    check(encoder.flush(bbuf));
    bbuf.flip();

    // Make room in the char buffer again; without this the decoder has no
    // space to write to and reports OVERFLOW without decoding anything.
    cbuf.clear();
    check(decoder.decode(bbuf, cbuf, true));
    check(decoder.flush(cbuf));
    cbuf.flip();

    return cbuf.toString();
  }

  /** Turns any coder result other than UNDERFLOW (normal completion) into an exception. */
  private static void check(java.nio.charset.CoderResult result)
      throws java.nio.charset.CharacterCodingException {
    if (!result.isUnderflow()) {
      result.throwException();
    }
  }
}
Extending Charset to create a hex charset
/*
* HexCharset.java
*
* Created on 22 December 2005, 21:56
*
* To change this template, choose Tools | Options and locate the template under
* the Source Creation and Management node. Right-click the template and choose
* Open. You can then make changes to the template in the Source Editor.
*/
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
 * Codec to translate between hex coding and byte string.
 *
 * In this charset the "characters" are hexadecimal digits and the "bytes" are
 * the values they spell: encoding turns hex text into bytes, decoding turns
 * bytes into hex text. Decoded output uses capital digits when the charset is
 * created with {@code caps == true} (name {@code HEX}) and lower-case digits
 * otherwise (name {@code hex}). When a line width is supplied the charset name
 * becomes {@code hex:nn} and the decoder inserts {@code \n} after every nn
 * output characters.
 *
 * @author malcolmm
 */
public class HexCharset extends Charset {
    private static final String codeHEX = "0123456789ABCDEF";
    private static final String codehex = "0123456789abcdef";

    /** Digit table used by the decoder, indexed by nibble value. */
    private final String codes;

    /** Decoded line width in characters, or {@code null} for no line breaks. */
    private final Integer measure;

    /**
     * Creates a new instance of HexCharset that never inserts line breaks.
     *
     * @param caps true for A-F, false for a-f
     */
    public HexCharset(boolean caps) {
        super(caps ? "HEX" : "hex", new String[]{"HEX"});
        codes = caps ? codeHEX : codehex;
        measure = null;
    }

    /**
     * Constructs the charset with a line width for decoded output.
     *
     * @param caps true for A-F, false for a-f
     * @param measure Line width for decoding; must be at least 1
     * @throws IllegalArgumentException if {@code measure} is less than 1
     */
    public HexCharset(boolean caps, int measure) {
        super((caps ? "HEX" : "hex") + ":" + measure, new String[]{"HEX"});
        // Fail fast: a non-positive width would otherwise yield an invalid
        // maxCharsPerByte and only blow up later when a coder is created.
        if (measure < 1) {
            throw new IllegalArgumentException("Line width must be positive: " + measure);
        }
        codes = caps ? codeHEX : codehex;
        this.measure = measure;
    }

    /**
     * Constructs a new encoder (hex text to bytes) for this charset.
     *
     * @return A new encoder for this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }

    /**
     * Constructs a new decoder (bytes to hex text) for this charset.
     *
     * @return A new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     * This implementation treats every {@code HexCharset} as contained and
     * every other charset as not contained.
     *
     * @param cs the charset to test
     * @return {@code true} if {@code cs} is a {@code HexCharset}
     */
    @Override
    public boolean contains(Charset cs) {
        return cs instanceof HexCharset;
    }

    /**
     * Encoder: reads hex digits and writes one byte per pair of digits.
     * Whitespace between digits is skipped.
     */
    private class Encoder extends CharsetEncoder {
        /** True while a high nibble has been read and its partner is pending. */
        private boolean unpaired;

        /** The pending high nibble, already shifted into the upper four bits. */
        private int nyble;

        private Encoder() {
            super(HexCharset.this, 0.49f, 1f);
        }

        /**
         * Flushes this encoder, verifying that no digit is left without a partner.
         *
         * @param out The output byte buffer (not written to)
         * @return {@link CoderResult#UNDERFLOW}
         * @throws IllegalArgumentException if an odd number of digits was encoded
         */
        @Override
        protected java.nio.charset.CoderResult implFlush(java.nio.ByteBuffer out) {
            if (unpaired) {
                throw new IllegalArgumentException("Hex string must be an even number of digits");
            }
            implReset();
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes hex digits into bytes until the input is exhausted or the
         * output buffer is full.
         *
         * Note that invalid input is signalled by throwing rather than through
         * the returned {@link CoderResult}.
         *
         * @param in The input character buffer
         * @param out The output byte buffer
         * @return {@link CoderResult#UNDERFLOW} when all input was consumed,
         *         {@link CoderResult#OVERFLOW} when {@code out} has no room left
         * @throws IllegalArgumentException if a character is neither whitespace
         *         nor a hexadecimal digit
         */
        @Override
        public java.nio.charset.CoderResult encodeLoop(java.nio.CharBuffer in, java.nio.ByteBuffer out) {
            while (in.remaining() > 0) {
                // Checked before consuming a character so nothing is lost on overflow.
                if (out.remaining() <= 0) {
                    return CoderResult.OVERFLOW;
                }
                char inch = in.get();
                if (Character.isWhitespace(inch)) {
                    continue;
                }
                int d = Character.digit(inch, 16);
                if (d < 0) {
                    throw new IllegalArgumentException("Bad hex character " + inch);
                }
                if (unpaired) {
                    // Second digit of the pair: combine with the stored high nibble.
                    out.put((byte) (nyble | d));
                } else {
                    // First digit of the pair: remember it as the high nibble.
                    nyble = d << 4;
                }
                unpaired = !unpaired;
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Clear state
         */
        @Override
        protected void implReset() {
            unpaired = false;
            nyble = 0;
        }
    }

    /**
     * Decoder: writes two hex digits per input byte, breaking lines at the
     * configured width when one was given.
     */
    private class Decoder extends CharsetDecoder {
        /** Number of digits written on the current output line. */
        private int charCount;

        private Decoder() {
            // Two digits per byte, plus room for the inserted line breaks.
            super(HexCharset.this, 2f, measure == null ? 2f : 2f + (2f / measure.intValue()));
        }

        /**
         * Decodes bytes into hex digits until the input is exhausted or the
         * output buffer cannot hold the next unit.
         *
         * A line break is written only when further input follows, so the
         * output never ends with a trailing newline.
         *
         * @param in The input byte buffer
         * @param out The output character buffer
         * @return {@link CoderResult#UNDERFLOW} when all input was consumed,
         *         {@link CoderResult#OVERFLOW} when {@code out} lacks room
         */
        @Override
        public java.nio.charset.CoderResult decodeLoop(java.nio.ByteBuffer in, java.nio.CharBuffer out) {
            while (in.remaining() > 0) {
                if (measure != null && charCount >= measure) {
                    if (out.remaining() == 0) {
                        return CoderResult.OVERFLOW;
                    }
                    out.put('\n');
                    charCount = 0;
                }
                // Both digits of a byte are written together or not at all.
                if (out.remaining() < 2) {
                    return CoderResult.OVERFLOW;
                }
                int b = in.get() & 0xff;
                out.put(codes.charAt(b >>> 4));
                out.put(codes.charAt(b & 0x0f));
                charCount += 2;
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Resets this decoder by starting a fresh output line.
         */
        @Override
        protected void implReset() {
            charCount = 0;
        }
    }
}
List Charsets
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import java.util.SortedMap;
/**
 * Prints every charset available in this JVM together with its aliases.
 */
public class Main {
  /**
   * Writes each charset on its own line, followed by one line per alias
   * prefixed with a single space. Charsets appear in the order of the sorted
   * map returned by {@link Charset#availableCharsets()}.
   *
   * @param args unused
   * @throws Exception declared for parity with the other examples
   */
  static public void main(String args[]) throws Exception {
    SortedMap<String, Charset> charsets = Charset.availableCharsets();
    // Iterating the values follows the same key order as iterating the key
    // set and looking each charset up, without the casts.
    for (Charset charset : charsets.values()) {
      System.out.println(charset);
      for (String alias : charset.aliases()) {
        System.out.println(" " + alias);
      }
    }
  }
}
Listing All Available Unicode to Character Set Converters
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
/**
 * Walks over the names of all available charsets and looks each one up.
 */
public class Main {
  /**
   * Obtains a {@link Charset} instance for every name reported by
   * {@link Charset#availableCharsets()}. Nothing is printed.
   *
   * @param argv unused
   * @throws Exception if a reported name cannot be resolved
   */
  public static void main(String[] argv) throws Exception {
    Map<String, Charset> map = Charset.availableCharsets();
    for (String charsetName : map.keySet()) {
      // Get charset
      Charset charset = Charset.forName(charsetName);
    }
  }
}
Translate Charset
import java.io.File;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
/**
 * Converts a file from UTF-8 to UTF-16 using memory-mapped input.
 */
public class Main {
  /**
   * Reads {@code inFilename} as UTF-8 and writes its content to
   * {@code outFilename} as UTF-16, replacing any previous content.
   *
   * @param args unused
   * @throws Exception if a file cannot be opened, read or written, or if the
   *         input is not valid UTF-8
   */
  static public void main(String args[]) throws Exception {
    File infile = new File("inFilename");
    File outfile = new File("outFilename");

    RandomAccessFile inraf = new RandomAccessFile(infile, "r");
    try {
      RandomAccessFile outraf = new RandomAccessFile(outfile, "rw");
      try {
        FileChannel finc = inraf.getChannel();
        FileChannel foutc = outraf.getChannel();

        // Use the channel's own size; an int cast would silently truncate the
        // length of very large files instead of letting map() reject them.
        MappedByteBuffer inmbb = finc.map(FileChannel.MapMode.READ_ONLY, 0, finc.size());

        // Canonical charset names rather than aliases.
        Charset inCharset = Charset.forName("UTF-8");
        Charset outCharset = Charset.forName("UTF-16");
        CharsetDecoder inDecoder = inCharset.newDecoder();
        CharsetEncoder outEncoder = outCharset.newEncoder();

        CharBuffer cb = inDecoder.decode(inmbb);
        ByteBuffer outbb = outEncoder.encode(cb);

        // "rw" mode does not truncate: drop old content so no stale bytes
        // remain after the new data. Done only once conversion has succeeded.
        foutc.truncate(0);
        // A single write is not guaranteed to drain the buffer.
        while (outbb.hasRemaining()) {
          foutc.write(outbb);
        }
      } finally {
        outraf.close();
      }
    } finally {
      inraf.close();
    }
  }
}