[Mulgara-svn] r1719 - in trunk/src/jar/query/java/org: jrdf/graph jrdf/util mulgara/query/rdf

Fri Jun 5 04:04:37 UTC 2009

Author: pag
Date: 2009-06-04 21:04:36 -0700 (Thu, 04 Jun 2009)
New Revision: 1719

Modified:
   trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java
   trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java
   trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java
   trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java
   trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java
Log:
Updated to perform escapes on the UTF-8 encoding that ARP provides. This will fall back to string escapes if a UTF-8 byte sequence is discovered to be invalid

Modified: trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java
===================================================================

--- trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java	2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/graph/AbstractLiteral.java	2009-06-05 04:04:36 UTC (rev 1719)
@@ -215,7 +215,7 @@
    * @return this instance in N-Triples format
    */
   public String getEscapedForm() {
-    String escaped = EscapeUtil.escape(getLexicalForm());
+    String escaped = EscapeUtil.escapeUTF8(getLexicalForm());
     return '\"' + escaped + '\"' + appendType();
   }
 

Modified: trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java
===================================================================
--- trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java	2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/util/EscapeUtil.java	2009-06-05 04:04:36 UTC (rev 1719)
@@ -7,144 +7,293 @@
  * A utility which applies N-Triples escaping.
  *
  * @author Andrew Newman
+ * @author Paul Gearon
  * @version $Revision: 624 $
  */
 public class EscapeUtil {
-    /**
-     * A regular expression to pick out characters needing escape from Unicode to
-     * ASCII.  A different regular expression is used depending on which version of the JDK is detected - Java 1.4 has
-     * different character support compared with 1.5 and above.
-     * <p/>
-     * This is used by the {@link #escape} method.
-     */
-    private static Pattern pattern;
+  /**
+   * A regular expression to pick out characters needing escape from Unicode to
+   * ASCII.  A different regular expression is used depending on which version of the JDK is detected - Java 1.4 has
+   * different character support compared with 1.5 and above.
+   * <p/>
+   * This is used by the {@link #escape} method.
+   */
+  private static Pattern pattern;
 
-    static {
-        try {
-            if (System.getProperty("java.version").indexOf("1.4") >= 0) {
-                pattern = Pattern.compile("[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]" +
-                        "|" +
-                        "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
-            } else {
-                pattern = Pattern.compile("[\uD800\uDC00-\uDBFF\uDFFF]" +
-                        "|" +
-                        "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
-            }
-        }
-        catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
+  static {
+      try {
+          if (System.getProperty("java.version").indexOf("1.4") >= 0) {
+              pattern = Pattern.compile("[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]" +
+                      "|" +
+                      "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
+          } else {
+              pattern = Pattern.compile("[\uD800\uDC00-\uDBFF\uDFFF]" +
+                      "|" +
+                      "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]");
+          }
+      }
+      catch (Exception e) {
+          e.printStackTrace();
+      }
+  }
 
-    /**
-     * Base UTF Code point.
-     */
-    private static final int UTF_BASE_CODEPOINT = 0x10000;
+  /**
+   * Base UTF Code point.
+   */
+  private static final int UTF_BASE_CODEPOINT = 0x10000;
 
-    /**
-     * How shift to get UTF-16 to character codes.
-     */
-    private static final int CHARACTER_CODE_OFFSET = 0x3FF;
+  /**
+   * Mask selecting the low 10 bits of a UTF-16 surrogate, used to rebuild the code point.
+   */
+  private static final int CHARACTER_CODE_OFFSET = 0x3FF;
 
-    /**
-     * How many characters at a time to decode for 8 bit encoding.
-     */
-    private static final int CHARACTER_LENGTH_8_BIT = 11;
+  /**
+   * How many characters at a time to decode for 8 bit encoding.
+   */
+  private static final int CHARACTER_LENGTH_8_BIT = 11;
 
-    /**
-     * How many characters at a time to decode for 16 bit encoding.
-     */
-    private static final int CHARACTER_LENGTH_16_BIT = 7;
+  /**
+   * How many characters at a time to decode for 16 bit encoding.
+   */
+  private static final int CHARACTER_LENGTH_16_BIT = 7;
 
-    private EscapeUtil() {
+  private EscapeUtil() {
+  }
+
+  /**
+   * Escapes a string literal to a string that is N-Triple escaped.
+   *
+   * @param string a string to escape, never <code>null</code>.
+   * @return a version of the <var>string</var> with N-Triples escapes applied.
+   */
+  public static final String escape(String string) {
+    assert null != string;
+
+    // Obtain a fresh matcher
+    Matcher matcher = pattern.matcher(string);
+
+    // Try to short-circuit the whole process -- maybe nothing needs escaping?
+    if (!matcher.find()) {
+      return string;
     }
 
-    /**
-     * Escapes a string literal to a string that is N-Triple escaped.
-     *
-     * @param string a string to escape, never <code>null</code>.
-     * @return a version of the <var>string</var> with N-Triples escapes applied.
-     */
-    public static String escape(String string) {
-        assert null != string;
+    // Perform escape character substitutions on each match found by the
+    // matcher, accumulating the escaped text into a stringBuffer
+    StringBuffer stringBuffer = new StringBuffer();
+    do {
+      // The escape text with which to replace the current match
+      String escapeString;
 
-        // Obtain a fresh matcher
-        Matcher matcher = pattern.matcher(string);
+      // Depending on the character sequence we're escaping, determine an
+      // appropriate replacement
+      String groupString = matcher.group();
+      switch (groupString.length()) {
+        case 1: // 16-bit characters requiring escaping
+          switch (groupString.charAt(0)) {
+            case '\t': // tab
+              escapeString = "\\\\t";
+              break;
+            case '\n': // newline
+              escapeString = "\\\\n";
+              break;
+            case '\r': // carriage return
+              escapeString = "\\\\r";
+              break;
+            case '"':  // quote
+              escapeString = "\\\\\\\"";
+              break;
+            case '\\': // backslash
+              escapeString = "\\\\\\\\";
+              break;
+            default:   // other characters use 4-digit hex escapes
+              String hexString = Integer.toHexString(groupString.charAt(0)).toUpperCase();
 
-        // Try to short-circuit the whole process -- maybe nothing needs escaping?
-        if (!matcher.find()) {
-            return string;
-        }
+              escapeString = "\\\\u0000".substring(0, CHARACTER_LENGTH_16_BIT - hexString.length()) + hexString;
 
-        // Perform escape character substitutions on each match found by the
-        // matcher, accumulating the escaped text into a stringBuffer
-        StringBuffer stringBuffer = new StringBuffer();
-        do {
-            // The escape text with which to replace the current match
-            String escapeString;
+              assert CHARACTER_LENGTH_16_BIT == escapeString.length();
+              assert escapeString.startsWith("\\\\u");
+              break;
+          }
+          break;
 
-            // Depending of the character sequence we're escaping, determine an
-            // appropriate replacement
-            String groupString = matcher.group();
-            switch (groupString.length()) {
-                case 1: // 16-bit characters requiring escaping
-                    switch (groupString.charAt(0)) {
-                        case '\t': // tab
-                            escapeString = "\\\\t";
-                            break;
-                        case '\n': // newline
-                            escapeString = "\\\\n";
-                            break;
-                        case '\r': // carriage return
-                            escapeString = "\\\\r";
-                            break;
-                        case '"':  // quote
-                            escapeString = "\\\\\\\"";
-                            break;
-                        case '\\': // backslash
-                            escapeString = "\\\\\\\\";
-                            break;
-                        default:   // other characters use 4-digit hex escapes
-                            String hexString = Integer.toHexString(groupString.charAt(0)).toUpperCase();
+        case 2: // surrogate pairs are represented as 8-digit hex escapes
+          assert Character.SURROGATE == Character.getType(groupString.charAt(0));
+          assert Character.SURROGATE == Character.getType(groupString.charAt(1));
 
-                            escapeString = "\\\\u0000".substring(0, CHARACTER_LENGTH_16_BIT - hexString.length()) +
-                                    hexString;
+          int highSurrogate = ((groupString.charAt(0) & CHARACTER_CODE_OFFSET) << 10);
+          int lowSurrogate = (groupString.charAt(1) & CHARACTER_CODE_OFFSET);
+          String hexString = Integer.toHexString(highSurrogate + lowSurrogate + UTF_BASE_CODEPOINT).
+                  toUpperCase();
+          escapeString = "\\\\U00000000".substring(0, CHARACTER_LENGTH_8_BIT - hexString.length()) +
+                  hexString;
 
-                            assert CHARACTER_LENGTH_16_BIT == escapeString.length();
-                            assert escapeString.startsWith("\\\\u");
-                            break;
-                    }
-                    break;
+          assert CHARACTER_LENGTH_8_BIT == escapeString.length();
+          assert escapeString.startsWith("\\\\U00") : "Expected a start of \\\\U00, but got " + escapeString;
+          break;
 
-                case 2: // surrogate pairs are represented as 8-digit hex escapes
-                    assert Character.SURROGATE == Character.getType(groupString.charAt(0));
-                    assert Character.SURROGATE == Character.getType(groupString.charAt(1));
+        default:
+          throw new Error("Escape sequence " + groupString + " has no handler");
+      }
+      assert null != escapeString;
 
-                    int highSurrogate = ((groupString.charAt(0) & CHARACTER_CODE_OFFSET) << 10);
-                    int lowSurrogate = (groupString.charAt(1) & CHARACTER_CODE_OFFSET);
-                    String hexString = Integer.toHexString(highSurrogate + lowSurrogate + UTF_BASE_CODEPOINT).
-                            toUpperCase();
-                    escapeString = "\\\\U00000000".substring(0, CHARACTER_LENGTH_8_BIT - hexString.length()) +
-                            hexString;
+      // Having determined an appropriate escapeString, add it to the
+      // stringBuffer
+      matcher.appendReplacement(stringBuffer, escapeString);
+    }
+    while (matcher.find());
 
-                    assert CHARACTER_LENGTH_8_BIT == escapeString.length();
-                    assert escapeString.startsWith("\\\\U00") : "Expected a start of \\\\U00, but got " + escapeString;
-                    break;
+    // Finish off by appending any remaining text that didn't require escaping,
+    // and return the assembled buffer
+    matcher.appendTail(stringBuffer);
+    return stringBuffer.toString();
+  }
 
-                default:
-                    throw new Error("Escape sequence " + groupString + " has no handler");
-            }
-            assert null != escapeString;
 
-            // Having determined an appropriate escapeString, add it to the
-            // stringBuffer
-            matcher.appendReplacement(stringBuffer, escapeString);
+  /**
+   * Escapes a string which contains a UTF-8 encoding in the internal array of char.
+   * If a UTF-8 encoding is found to be invalid, then this will drop back to
+   * escaping the data as a normal string. Escaping is performed with the NTriples
+   * encoding recommendation:
+   * <a href="http://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#ntrip_strings">&sect;3.2</a>
+   * @param string The string to escape.
+   * @return An escaped version of the string.
+   */
+  public static final String escapeUTF8(String string) {
+    assert null != string;
+
+    // Decode one UTF-8 sequence per iteration, accumulating the escaped
+    // text into a StringBuilder
+    StringBuilder buffer = new StringBuilder();
+
+    try {
+      int i = 0;
+      while (i < string.length()) {
+        char c = string.charAt(i);
+        int bytes = getByteCount(c);
+        if (bytes == 4) {
+          int codepoint = getCodepoint(string, i, c);
+          buffer.append(String.format("\\U%08X", codepoint));
+        } else {
+          if (bytes != 1) c = getChar(string, i, bytes, c);
+  
+          switch (c) {
+            case 0x9:
+              buffer.append("\\t");
+              break;
+            case 0xA:
+              buffer.append("\\n");
+              break;
+            case 0xD:
+              buffer.append("\\r");
+              break;
+            case 0x22:
+              buffer.append("\\\"");
+              break;
+            case 0x5C:
+              buffer.append("\\\\");
+              break;
+            default:
+              if (c <= 0x1F || c >= 0x7F) {
+                buffer.append(String.format("\\u%04X", (int)c));
+              } else {
+                buffer.append(c);
+              }
+          }
         }
-        while (matcher.find());
+        i += bytes;
+      }
+  
+      return buffer.toString();
+    } catch (Exception e) {
+      // This is not a sequence of UTF-8 characters. Fall back to the old escape algorithm.
+      return escape(string);
+    }
+  }
 
-        // Finish off by appending any remaining text that didn't require escaping,
-        // and return the assembled buffer
-        matcher.appendTail(stringBuffer);
-        return stringBuffer.toString();
+
+  /**
+   * Determine the number of characters in a UTF-8 sequence, based on the start of the sequence.
+   * @param c The first byte from the sequence, held in a char.
+   * @return The number of bytes in the sequence.
+   * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence.
+   */
+  static final int getByteCount(char c) {
+    if ((c & 0xFF80) == 0) return 1;
+    if ((c & 0xFFE0) == 0xC0) return 2;
+    if ((c & 0xFFF0) == 0xE0) return 3;
+    if ((c & 0xFFF8) != 0xF0) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+    return 4;
+  }
+
+
+  /**
+   * Calculate the codepoint (a character that doesn't fit into a char) represented
+   * by a 4 byte UTF-8 encoding.
+   * @param s The string containing the encoding. Each char in the string contains
+   *        a single byte from the sequence.
+   * @param offset The start of the 4 byte sequence.
+   * @param startChar The first byte (retrieved as a char) in the sequence.
+   *        This is identical to s.charAt(offset) but this was already called
+   *        for {@link #getByteCount(char)}, so we reuse it here.
+   * @return The Unicode codepoint represented by the 4 byte sequence.
+   * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence.
+   */
+  static final int getCodepoint(String s, int offset, char startChar) {
+    int secondChar = s.charAt(offset + 1);
+    int thirdChar = s.charAt(offset + 2);
+    int fourthChar = s.charAt(offset + 3);
+
+    // byte sequence is: 11110zzz, 10zzyyyy, 10yyyyxx, 10xxxxxx
+    // check that the trailing bytes all start correctly
+    if ((secondChar & 0xC0) != 0x80 || (thirdChar & 0xC0) != 0x80 || (fourthChar & 0xC0) != 0x80) {
+      throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
     }
+    int x = fourthChar & 0x3F;
+    int yx = thirdChar & 0x3F;
+    int zy = secondChar & 0x3F;
+    int z = (startChar & 0x07) << 2 | zy >> 4;
+    x |= (yx & 0x03) << 6;
+    int y = yx >> 2 | (zy & 0x0F) << 4;
+    return (z << 16) | (y << 8) | x;
+  }
+
+
+  /**
+   * Calculate the character represented by a 2 byte or 3 byte UTF-8 encoding.
+   * @param s The string containing the encoding. Each char in the string contains
+   *        a single byte from the sequence.
+   * @param offset The start of the 2 or 3 byte sequence.
+   * @param count The number of bytes in the sequence
+   *        (already determined through {@link #getByteCount(char)}).
+   * @param startChar The first byte (retrieved as a char) in the sequence.
+   *        This is identical to s.charAt(offset) but this was already called
+   *        for {@link #getByteCount(char)}, so we reuse it here.
+   * @return The Unicode character represented by the 2 or 3 byte sequence.
+   */
+  static final char getChar(String s, int offset, int count, char startChar) {
+    assert count == 2 || count == 3;
+    int lastPos = offset + count - 1;
+    int lastChar = s.charAt(lastPos);
+
+    // check that the last byte matches 10xxxxxx
+    if ((lastChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+    int x = lastChar & 0x3F;
+    int yx;
+    int y;
+    if (count == 2) {
+      // 2 byte sequence. First byte is 110yyyxx, second is 10xxxxxx
+      yx = startChar & 0x3F;
+      y = yx >> 2;
+    } else {
+      // 3 byte sequence. First byte is 1110yyyy, Second byte is 10yyyyxx
+      int secondChar = s.charAt(offset + 1);
+      // check that second byte starts correctly 
+      if ((secondChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence.");
+      yx = secondChar & 0x3F;
+      y = (yx >> 2) | (startChar & 0x0F) << 4;
+    }
+    x |= (yx & 0x03) << 6;
+    return (char)(y << 8 | x);
+  }
+
 }

Modified: trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java
===================================================================
--- trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java	2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/jrdf/util/EscapeUtilUnitTest.java	2009-06-05 04:04:36 UTC (rev 1719)
@@ -37,6 +37,7 @@
         testEscapedValue("\\u2260", "\u2260");
         testEscapedValue("q", "\u0071");
         testEscapedValue("\\u030C", "\u030c");
+        testEscapedValue("\\u00E9", "é");
     }
 
     public void testControlCharacters() {

Modified: trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java
===================================================================
--- trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java	2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImpl.java	2009-06-05 04:04:36 UTC (rev 1719)
@@ -43,7 +43,27 @@
 
 /**
  * An RDF literal node.
+ * 
+ * Strings for the constructor need to be in an unusual format. Each character in the
+ * string is used to represent a byte in a UTF-8 encoding. This has no effect on
+ * standard ASCII text, but once the characters get above 0x7F then this format no
+ * longer represents the string, but rather the encoded data. This is the format
+ * returned from the ARP parser.
  *
+ * For instance, the character é (e acute) has a Unicode value of 0xE9, and a
+ * UTF-8 encoding of [0xC3, 0xA9]. The literal string used to represent this character
+ * has two characters in it (representing the 2 bytes from the UTF-8 encoding),
+ * specifically Ã (Latin capital letter A with tilde, Unicode value 0xC3)
+ * and © (copyright sign, Unicode value 0xA9). This is despite the fact that a Java
+ * string can represent e-acute with a single character.
+ * 
+ * Previously, this class accepted normal Java strings, and would escape them
+ * correctly when presenting the lexical form. If the string turns out not to be a
+ * valid UTF-8 byte sequence, escaping falls back to that earlier string-based method.
+ * This means that many normal Java strings will still work with this class, but it is
+ * possible to construct a standard string that happens to look like a UTF-8
+ * encoding, and such a string will be escaped as the decoded character instead.
+ * 
+ *
  * @created 2001-08-13
  *
  * @author <a href="http://staff.pisoftware.com/raboczi">Simon Raboczi</a>

Modified: trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java
===================================================================
--- trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java	2009-06-04 21:33:37 UTC (rev 1718)
+++ trunk/src/jar/query/java/org/mulgara/query/rdf/LiteralImplUnitTest.java	2009-06-05 04:04:36 UTC (rev 1719)
@@ -169,6 +169,18 @@
         (new LiteralImpl("Deseret short ah: \ud801\udc09", "")).getEscapedForm()
         );
 
+    // Test a plain literal with an embedded 4 byte UTF-8 encoding
+    assertEquals(
+        "\"Deseret short ah: \\U00010409\"",
+        (new LiteralImpl("Deseret short ah: \u00f0\u0090\u0090\u0089", "")).getEscapedForm()
+        );
+
+    // Test a plain literal with an embedded 3 byte UTF-8 encoding
+    assertEquals(
+        "\"Devanagari letter i: \\u0907\"",
+        (new LiteralImpl("Devanagari letter i: \u00e0\u00a4\u0087", "")).getEscapedForm()
+        );
+
     // Test a plain literal with broken Unicode surrogates -- they should be
     // formatted the same way non-ASCII characters are (4-digit hex)
     assertEquals(