[Mulgara-svn] r1719 - in trunk/src/jar/query/java/org: jrdf/graph jrdf/util mulgara/query/rdf
pag at mulgara.org
pag at mulgara.org
Fri Jun 5 04:04:37 UTC 2009
Author: pag
Date: 2009-06-04 21:04:36 -0700 (Thu, 04 Jun 2009)
New Revision: 1719
Modified:
trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java
trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java
trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java
trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java
trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java
Log:
Updated to perform escapes on the UTF-8 encoding that ARP provides. This will fall back to string escapes if a UTF-8 byte sequence is discovered to be invalid
Modified: trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java
===================================================================
--- trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java 2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java 2009-06-05 04:04:36 UTC (rev 1719)
@@ -215,7 +215,7 @@
* @return this instance in N-Triples format
*/
public String getEscapedForm() {
- String escaped = EscapeUtil.escape(getLexicalForm());
+ String escaped = EscapeUtil.escapeUTF8(getLexicalForm());
return '\"' + escaped + '\"' + appendType();
}
Modified: trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java
===================================================================
--- trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java 2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java 2009-06-05 04:04:36 UTC (rev 1719)
@@ -7,144 +7,293 @@
* A utility which applies N-Triples escaping.
*
* @author Andrew Newman
+ * @author Paul Gearon
* @version $Revision: 624 $
*/
public class EscapeUtil {
- /**
- * A regular expression to pick out characters needing escape from Unicode to
- * ASCII. A different regular expression is used depending on which version of the JDK is detected - Java 1.4 has
- * different character support compared with 1.5 and above.
- * <p/>
- * This is used by the {@link #escape} method.
- */
- private static Pattern pattern;
+ /**
+ * A regular expression to pick out characters needing escape from Unicode to
+ * ASCII. A different regular expression is used depending on which version of the JDK is detected - Java 1.4 has
+ * different character support compared with 1.5 and above.
+ * <p/>
+ * This is used by the {@link #escape} method.
+ */
+ private static Pattern pattern;
- static {
- try {
- if (System.getProperty("java.version").indexOf("1.4") >= 0) {
- pattern = Pattern.compile("[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]" +
- "|" +
- "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
- } else {
- pattern = Pattern.compile("[\uD800\uDC00-\uDBFF\uDFFF]" +
- "|" +
- "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
- }
- }
- catch (Exception e) {
- e.printStackTrace();
- }
- }
+ static {
+ try {
+ if (System.getProperty("java.version").indexOf("1.4") >= 0) {
+ pattern = Pattern.compile("[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]" +
+ "|" +
+ "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
+ } else {
+ pattern = Pattern.compile("[\uD800\uDC00-\uDBFF\uDFFF]" +
+ "|" +
+ "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
+ }
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
- /**
- * Base UTF Code point.
- */
- private static final int UTF_BASE_CODEPOINT = 0x10000;
+ /**
+ * Base UTF Code point.
+ */
+ private static final int UTF_BASE_CODEPOINT = 0x10000;
- /**
- * How shift to get UTF-16 to character codes.
- */
- private static final int CHARACTER_CODE_OFFSET = 0x3FF;
+ /**
+ * How shift to get UTF-16 to character codes.
+ */
+ private static final int CHARACTER_CODE_OFFSET = 0x3FF;
- /**
- * How many characters at a time to decode for 8 bit encoding.
- */
- private static final int CHARACTER_LENGTH_8_BIT = 11;
+ /**
+ * How many characters at a time to decode for 8 bit encoding.
+ */
+ private static final int CHARACTER_LENGTH_8_BIT = 11;
- /**
- * How many characters at a time to decode for 16 bit encoding.
- */
- private static final int CHARACTER_LENGTH_16_BIT = 7;
+ /**
+ * How many characters at a time to decode for 16 bit encoding.
+ */
+ private static final int CHARACTER_LENGTH_16_BIT = 7;
- private EscapeUtil() {
+ private EscapeUtil() {
+ }
+
+ /**
+ * Escapes a string literal to a string that is N-Triple escaped.
+ *
+ * @param string a string to escape, never <code>null</code>.
+ * @return a version of the <var>string</var> with N-Triples escapes applied.
+ */
+ public static final String escape(String string) {
+ assert null != string;
+
+ // Obtain a fresh matcher
+ Matcher matcher = pattern.matcher(string);
+
+ // Try to short-circuit the whole process -- maybe nothing needs escaping?
+ if (!matcher.find()) {
+ return string;
}
- /**
- * Escapes a string literal to a string that is N-Triple escaped.
- *
- * @param string a string to escape, never <code>null</code>.
- * @return a version of the <var>string</var> with N-Triples escapes applied.
- */
- public static String escape(String string) {
- assert null != string;
+ // Perform escape character substitutions on each match found by the
+ // matcher, accumulating the escaped text into a stringBuffer
+ StringBuffer stringBuffer = new StringBuffer();
+ do {
+ // The escape text with which to replace the current match
+ String escapeString;
- // Obtain a fresh matcher
- Matcher matcher = pattern.matcher(string);
+ // Depending of the character sequence we're escaping, determine an
+ // appropriate replacement
+ String groupString = matcher.group();
+ switch (groupString.length()) {
+ case 1: // 16-bit characters requiring escaping
+ switch (groupString.charAt(0)) {
+ case '\t': // tab
+ escapeString = "\\\\t";
+ break;
+ case '\n': // newline
+ escapeString = "\\\\n";
+ break;
+ case '\r': // carriage return
+ escapeString = "\\\\r";
+ break;
+ case '"': // quote
+ escapeString = "\\\\\\\"";
+ break;
+ case '\\': // backslash
+ escapeString = "\\\\\\\\";
+ break;
+ default: // other characters use 4-digit hex escapes
+ String hexString = Integer.toHexString(groupString.charAt(0)).toUpperCase();
- // Try to short-circuit the whole process -- maybe nothing needs escaping?
- if (!matcher.find()) {
- return string;
- }
+ escapeString = "\\\\u0000".substring(0, CHARACTER_LENGTH_16_BIT - hexString.length()) + hexString;
- // Perform escape character substitutions on each match found by the
- // matcher, accumulating the escaped text into a stringBuffer
- StringBuffer stringBuffer = new StringBuffer();
- do {
- // The escape text with which to replace the current match
- String escapeString;
+ assert CHARACTER_LENGTH_16_BIT == escapeString.length();
+ assert escapeString.startsWith("\\\\u");
+ break;
+ }
+ break;
- // Depending of the character sequence we're escaping, determine an
- // appropriate replacement
- String groupString = matcher.group();
- switch (groupString.length()) {
- case 1: // 16-bit characters requiring escaping
- switch (groupString.charAt(0)) {
- case '\t': // tab
- escapeString = "\\\\t";
- break;
- case '\n': // newline
- escapeString = "\\\\n";
- break;
- case '\r': // carriage return
- escapeString = "\\\\r";
- break;
- case '"': // quote
- escapeString = "\\\\\\\"";
- break;
- case '\\': // backslash
- escapeString = "\\\\\\\\";
- break;
- default: // other characters use 4-digit hex escapes
- String hexString = Integer.toHexString(groupString.charAt(0)).toUpperCase();
+ case 2: // surrogate pairs are represented as 8-digit hex escapes
+ assert Character.SURROGATE == Character.getType(groupString.charAt(0));
+ assert Character.SURROGATE == Character.getType(groupString.charAt(1));
- escapeString = "\\\\u0000".substring(0, CHARACTER_LENGTH_16_BIT - hexString.length()) +
- hexString;
+ int highSurrogate = ((groupString.charAt(0) & CHARACTER_CODE_OFFSET) << 10);
+ int lowSurrogate = (groupString.charAt(1) & CHARACTER_CODE_OFFSET);
+ String hexString = Integer.toHexString(highSurrogate + lowSurrogate + UTF_BASE_CODEPOINT).
+ toUpperCase();
+ escapeString = "\\\\U00000000".substring(0, CHARACTER_LENGTH_8_BIT - hexString.length()) +
+ hexString;
- assert CHARACTER_LENGTH_16_BIT == escapeString.length();
- assert escapeString.startsWith("\\\\u");
- break;
- }
- break;
+ assert CHARACTER_LENGTH_8_BIT == escapeString.length();
+ assert escapeString.startsWith("\\\\U00") : "Expected a start of \\\\U00, but got " + escapeString;
+ break;
- case 2: // surrogate pairs are represented as 8-digit hex escapes
- assert Character.SURROGATE == Character.getType(groupString.charAt(0));
- assert Character.SURROGATE == Character.getType(groupString.charAt(1));
+ default:
+ throw new Error("Escape sequence " + groupString + " has no handler");
+ }
+ assert null != escapeString;
- int highSurrogate = ((groupString.charAt(0) & CHARACTER_CODE_OFFSET) << 10);
- int lowSurrogate = (groupString.charAt(1) & CHARACTER_CODE_OFFSET);
- String hexString = Integer.toHexString(highSurrogate + lowSurrogate + UTF_BASE_CODEPOINT).
- toUpperCase();
- escapeString = "\\\\U00000000".substring(0, CHARACTER_LENGTH_8_BIT - hexString.length()) +
- hexString;
+ // Having determined an appropriate escapeString, add it to the
+ // stringBuffer
+ matcher.appendReplacement(stringBuffer, escapeString);
+ }
+ while (matcher.find());
- assert CHARACTER_LENGTH_8_BIT == escapeString.length();
- assert escapeString.startsWith("\\\\U00") : "Expected a start of \\\\U00, but got " + escapeString;
- break;
+ // Finish off by appending any remaining text that didn't require escaping,
+ // and return the assembled buffer
+ matcher.appendTail(stringBuffer);
+ return stringBuffer.toString();
+ }
- default:
- throw new Error("Escape sequence " + groupString + " has no handler");
- }
- assert null != escapeString;
- // Having determined an appropriate escapeString, add it to the
- // stringBuffer
- matcher.appendReplacement(stringBuffer, escapeString);
+ /**
+ * Escapes a string which contains a UTF-8 encoding in the internal array of char.
+ * If a UTF-8 encoding is found to be invalid, then this will drop back to
+ * escaping the data as a normal string. Escaping is performed with the NTriples
+ * encoding recommendation:
+ * <a href="http://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#ntrip_strings">§3.2</a>
+ * @param string The string to escape.
+ * @return An escaped version of the string.
+ */
+ public static final String escapeUTF8(String string) {
+ assert null != string;
+
+ // Decode each UTF-8 byte sequence in turn and apply the N-Triples escapes,
+ // accumulating the escaped text into a StringBuilder
+ StringBuilder buffer = new StringBuilder();
+
+ try {
+ int i = 0;
+ while (i < string.length()) {
+ char c = string.charAt(i);
+ int bytes = getByteCount(c);
+ if (bytes == 4) {
+ int codepoint = getCodepoint(string, i, c);
+ buffer.append(String.format("\\U%08X", codepoint));
+ } else {
+ if (bytes != 1) c = getChar(string, i, bytes, c);
+
+ switch (c) {
+ case 0x9:
+ buffer.append("\\t");
+ break;
+ case 0xA:
+ buffer.append("\\n");
+ break;
+ case 0xD:
+ buffer.append("\\r");
+ break;
+ case 0x22:
+ buffer.append("\\\"");
+ break;
+ case 0x5C:
+ buffer.append("\\\\");
+ break;
+ default:
+ if (c <= 0x1F || c >= 0x7F) {
+ buffer.append(String.format("\\u%04X", (int)c));
+ } else {
+ buffer.append(c);
+ }
+ }
}
- while (matcher.find());
+ i += bytes;
+ }
+
+ return buffer.toString();
+ } catch (Exception e) {
+ // This is not a sequence of UTF-8 characters. Fall back to the old escape algorithm.
+ return escape(string);
+ }
+ }
- // Finish off by appending any remaining text that didn't require escaping,
- // and return the assembled buffer
- matcher.appendTail(stringBuffer);
- return stringBuffer.toString();
+
+ /**
+ * Determine the number of characters in a UTF-8 sequence, based on the start of the sequence.
+ * @param c The first byte from the sequence, held in a char.
+ * @return The number of bytes in the sequence.
+ * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence.
+ */
+ static final int getByteCount(char c) {
+ if ((c & 0xFF80) == 0) return 1;
+ if ((c & 0xFFE0) == 0xC0) return 2;
+ if ((c & 0xFFF0) == 0xE0) return 3;
+ if ((c & 0xFFF8) != 0xF0) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+ return 4;
+ }
+
+
+ /**
+ * Calculate the codepoint (a character that doesn't fit into a char) represented
+ * by a 4 byte UTF-8 encoding.
+ * @param s The string containing the encoding. Each char in the string contains
+ * a single byte from the sequence.
+ * @param offset The start of the 4 byte sequence.
+ * @param startChar The first byte (retrieved as a char) in the sequence.
+ * This is identical to s.charAt(offset) but this was already called
+ * for {@link #getByteCount(char)}, so we reuse it here.
+ * @return The Unicode codepoint represented by the 4 byte sequence.
+ * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence.
+ */
+ static final int getCodepoint(String s, int offset, char startChar) {
+ int secondChar = s.charAt(offset + 1);
+ int thirdChar = s.charAt(offset + 2);
+ int fourthChar = s.charAt(offset + 3);
+
+ // byte sequence is: 11110zzz, 10zzyyyy, 10yyyyxx, 10xxxxxx
+ // check that the trailing bytes all start correctly
+ if ((secondChar & 0xC0) != 0x80 || (thirdChar & 0xC0) != 0x80 || (fourthChar & 0xC0) != 0x80) {
+ throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
}
+ int x = fourthChar & 0x3F;
+ int yx = thirdChar & 0x3F;
+ int zy = secondChar & 0x3F;
+ int z = (startChar & 0x07) << 2 | zy >> 4;
+ x |= (yx & 0x03) << 6;
+ int y = yx >> 2 | (zy & 0x0F) << 4;
+ return (z << 16) | (y << 8) | x;
+ }
+
+
+ /**
+ * Calculate the character represented by a 2 byte or 3 byte UTF-8 encoding.
+ * @param s The string containing the encoding. Each char in the string contains
+ * a single byte from the sequence.
+ * @param offset The start of the 2 or 3 byte sequence.
+ * @param count The number of bytes in the sequence
+ * (already determined through {@link #getByteCount(char)}).
+ * @param startChar The first byte (retrieved as a char) in the sequence.
+ * This is identical to s.charAt(offset) but this was already called
+ * for {@link #getByteCount(char)}, so we reuse it here.
+ * @return The Unicode character represented by the 2 or 3 byte sequence.
+ */
+ static final char getChar(String s, int offset, int count, char startChar) {
+ assert count == 2 || count == 3;
+ int lastPos = offset + count - 1;
+ int lastChar = s.charAt(lastPos);
+
+ // check that the last byte matches 10xxxxxx
+ if ((lastChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+ int x = lastChar & 0x3F;
+ int yx;
+ int y;
+ if (count == 2) {
+ // 2 byte sequence. First byte is 110yyyxx, second is 10xxxxxx
+ yx = startChar & 0x3F;
+ y = yx >> 2;
+ } else {
+ // 3 byte sequence. First byte is 1110yyyy, Second byte is 10yyyyxx
+ int secondChar = s.charAt(offset + 1);
+ // check that second byte starts correctly
+ if ((secondChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+ yx = secondChar & 0x3F;
+ y = (yx >> 2) | (startChar & 0x0F) << 4;
+ }
+ x |= (yx & 0x03) << 6;
+ return (char)(y << 8 | x);
+ }
+
}
Modified: trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java
===================================================================
--- trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java 2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java 2009-06-05 04:04:36 UTC (rev 1719)
@@ -37,6 +37,7 @@
testEscapedValue("\\u2260", "\u2260");
testEscapedValue("q", "\u0071");
testEscapedValue("\\u030C", "\u030c");
+ testEscapedValue("\\u00E9", "\u00e9");
}
public void testControlCharacters() {
Modified: trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java
===================================================================
--- trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java 2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java 2009-06-05 04:04:36 UTC (rev 1719)
@@ -43,7 +43,27 @@
/**
* An RDF literal node.
+ *
+ * Strings for the constructor need to be in an unusual format. Each character in the
+ * string is used to represent a byte in a UTF-8 encoding. This has no effect on
+ * standard ASCII text, but once the characters get above 0x7F then this format no
+ * longer represents the string, but rather the encoded data. This is the format
+ * returned from the ARP parser.
*
+ * For instance, the character é (e acute) has a unicode value of 0xE9, and a
+ * UTF-8 encoding of [0xC3, 0xA9]. The literal string used to represent this character
+ * has two characters in it (representing the 2 bytes from the UTF-8 encoding),
+ * specifically Ã (Latin capital letter A with tilde, unicode value 0xC3)
+ * and © (copyright sign, unicode value 0xA9). This is despite the fact that a Java
+ * string can represent e-acute with a single character.
+ *
+ * Previously, this class accepted normal Java strings, and would escape them
+ * correctly when presenting the lexical form. If UTF-8 decoding fails while escaping,
+ * then it will fall back to that older method. This means that many normal Java strings
+ * will still work with this class, but it is possible to construct a standard string
+ * that is also a valid UTF-8 byte sequence, and such a string will be decoded as UTF-8.
+ *
+ *
* @created 2001-08-13
*
* @author <a href="http://staff.pisoftware.com/raboczi">Simon Raboczi</a>
Modified: trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java
===================================================================
--- trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java 2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java 2009-06-05 04:04:36 UTC (rev 1719)
@@ -169,6 +169,18 @@
(new LiteralImpl("Deseret short ah: \ud801\udc09", "")).getEscapedForm()
);
+ // Test a plain literal with an embedded 4 byte UTF-8 encoding
+ assertEquals(
+ "\"Deseret short ah: \\U00010409\"",
+ (new LiteralImpl("Deseret short ah: \u00f0\u0090\u0090\u0089", "")).getEscapedForm()
+ );
+
+ // Test a plain literal with an embedded 3 byte UTF-8 encoding
+ assertEquals(
+ "\"Devanagari letter i: \\u0907\"",
+ (new LiteralImpl("Devanagari letter i: \u00e0\u00a4\u0087", "")).getEscapedForm()
+ );
+
// Test a plain literal with broken Unicode surrogates -- they should be
// formatted the same way non-ASCII characters are (4-digit hex)
assertEquals(
More information about the Mulgara-svn
mailing list