[Mulgara-svn] r1446 - in trunk: jxdata/iTQL/fulltext_queries src/jar/resolver-lucene/java/org/mulgara/resolver/lucene

ronald at mulgara.org ronald at mulgara.org
Wed Jan 21 12:44:51 UTC 2009


Author: ronald
Date: 2009-01-21 04:44:47 -0800 (Wed, 21 Jan 2009)
New Revision: 1446

Modified:
   trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt
   trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt
   trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt
   trunk/jxdata/iTQL/fulltext_queries/test.jxu
   trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java
   trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java
   trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java
Log:
Changed LuceneResolver to lazy-evaluate the result (FullTextStringIndexTuples).
This allows other parts of the query to produce results first which are then
used to limit the search results.

Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt	2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><score/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.67</score></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.38</score></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><score/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.50</score></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.20</score></solution></query></answer>

Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt	2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.67</sc1></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.38</sc1></solution><solution><pmid resource="urn:pmid:11244589"/><title>[Cholelithiasis in heart transplant patients]</title><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">2.75</sc2></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.50</sc1></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.20</sc1></solution><solution><pmid resource="urn:pmid:11244589"/><title>[Cholelithiasis in heart transplant patients]</title><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">2.61</sc2></solution></query></answer>

Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt	2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.38</sc1><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">1.04</sc2></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.20</sc1><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">0.92</sc2></solution></query></answer>

Modified: trunk/jxdata/iTQL/fulltext_queries/test.jxu
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/test.jxu	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/test.jxu	2009-01-21 12:44:47 UTC (rev 1446)
@@ -314,8 +314,8 @@
            order by $pmid $title;"/>
 
   <eval stepClass="org.mulgara.store.jxunit.QueryJX" />
-  <subst name="queryResult" regexp="1\.67[0-9]*" value="1.67"/>
-  <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
+  <subst name="queryResult" regexp="1\.(50|51)[0-9]*" value="1.50"/>
+  <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
   <ifEqual converse="true" file="queryResult17.txt" name="queryResult">
     <save name="queryResult" file="badQuery17Result.xml"/>
     <fail>Output failed.  Check badQuery17Result.xml for output.</fail>
@@ -334,9 +334,9 @@
            order by $pmid $title;"/>
 
   <eval stepClass="org.mulgara.store.jxunit.QueryJX" />
-  <subst name="queryResult" regexp="1\.67[0-9]*" value="1.67"/>
-  <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
-  <subst name="queryResult" regexp="2\.75[0-9]*" value="2.75"/>
+  <subst name="queryResult" regexp="1\.(50|51)[0-9]*" value="1.50"/>
+  <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
+  <subst name="queryResult" regexp="2\.(61|62)[0-9]*" value="2.61"/>
   <ifEqual converse="true" file="queryResult18.txt" name="queryResult">
     <save name="queryResult" file="badQuery18Result.xml"/>
     <fail>Output failed.  Check badQuery18Result.xml for output.</fail>
@@ -355,8 +355,8 @@
            order by $pmid $title;"/>
 
   <eval stepClass="org.mulgara.store.jxunit.QueryJX" />
-  <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
-  <subst name="queryResult" regexp="1\.04[0-9]*" value="1.04"/>
+  <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
+  <subst name="queryResult" regexp="0\.(92|93)[0-9]*" value="0.92"/>
   <ifEqual converse="true" file="queryResult19.txt" name="queryResult">
     <save name="queryResult" file="badQuery19Result.xml"/>
     <fail>Output failed.  Check badQuery19Result.xml for output.</fail>

Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java	2009-01-21 12:44:47 UTC (rev 1446)
@@ -53,6 +53,7 @@
 import org.apache.lucene.search.HitCollector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
@@ -561,13 +562,15 @@
                      " predicate :" + predicate + " literal :" + literal);
       }
 
-      if ((subject != null) && (subject.length() > 0)) {
+      if (subject != null) {
         TermQuery tSubject = new TermQuery(new Term(SUBJECT_KEY, subject));
+        tSubject.setBoost(0);
         bQuery.add(tSubject, BooleanClause.Occur.MUST);
       }
 
-      if ((predicate != null) && (predicate.length() > 0)) {
+      if (predicate != null) {
         TermQuery tPredicate = new TermQuery(new Term(PREDICATE_KEY, predicate));
+        tPredicate.setBoost(0);
         bQuery.add(tPredicate, BooleanClause.Occur.MUST);
       }
 
@@ -652,6 +655,64 @@
   }
 
   /**
+   * Computes an upper bound on the number of documents the given query could return.
+   *
+   * @param subject   the subject; may be null
+   * @param predicate the predicate; may be null
+   * @param object    literal to be searched via the analyzer; may be null
+   * @return the upper bound; the searcher's maxDoc() if no argument narrows it
+   * @throws FullTextStringIndexException if an error occurred
+   */
+  public long getMaxDocs(String subject, String predicate, String object)
+      throws FullTextStringIndexException {
+    long total = -1;  // -1 means "no bound established yet"
+
+    try {
+      if (subject != null) {
+        total = indexSearcher.docFreq(new Term(SUBJECT_KEY, subject));
+        if (total == 0) return 0;
+      }
+
+      if (predicate != null) {  // must also set the bound when no subject was given
+        long predicateDocs = indexSearcher.docFreq(new Term(PREDICATE_KEY, predicate));
+        total = (total >= 0) ? Math.min(predicateDocs, total) : predicateDocs;
+        if (total == 0) return 0;
+      }
+
+      if (object != null) {
+        QueryParser parser = new QueryParser(LITERAL_KEY, analyzer);
+        total = findMinDocCount(parser.parse(object), total);
+      }
+
+      return (total >= 0) ? total : indexSearcher.maxDoc();
+    } catch (IOException ioe) {
+      closeIndexers = true;
+      throw new FullTextStringIndexException("Unable to count results for query '" + object + "'", ioe);
+    } catch (ParseException pe) {
+      throw new FullTextStringIndexException("Unable to parse query '" + object + "'", pe);
+    }
+  }
+
+  private long findMinDocCount(Query q, long max) throws IOException {  // max < 0: no bound yet
+    long count = max;
+    // Narrow the bound by the document frequency of every term the query requires.
+    if (q instanceof TermQuery) {
+      long freq = indexSearcher.docFreq(((TermQuery)q).getTerm());
+      count = (count < 0) ? freq : Math.min(freq, count);  // first frequency seen becomes the bound
+    } else if (q instanceof BooleanQuery) {
+      for (BooleanClause clause : ((BooleanQuery)q).getClauses()) {
+        if (clause.isRequired()) count = findMinDocCount(clause.getQuery(), count);
+      }
+    } else if (q instanceof PhraseQuery) {
+      for (Term term : ((PhraseQuery)q).getTerms()) {
+        count = (count < 0) ? indexSearcher.docFreq(term) : Math.min(indexSearcher.docFreq(term), count);
+      }
+    }
+
+    return count;  // unchanged (possibly still < 0) for query types not handled above
+  }
+
+  /**
    * Acquire the indexers.
    *
    * @param forWrites whether to acquire an index writer

Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java	2009-01-21 12:44:47 UTC (rev 1446)
@@ -40,6 +40,8 @@
 import org.apache.lucene.document.Document;
 
 // JRDf
+import org.jrdf.graph.BlankNode;
+import org.jrdf.graph.Literal;
 import org.jrdf.graph.URIReference;
 
 // local packages
@@ -82,9 +84,7 @@
   /** Logger.  */
   private final static Logger logger = Logger.getLogger(FullTextStringIndexTuples.class);
 
-  /**
-   * The native Lucene query result to represent as a {@link Tuples}.
-   */
+  /** The native Lucene query result to represent as a {@link Tuples}. */
   private FullTextStringIndex.Hits hits;
 
   /**
@@ -94,26 +94,28 @@
    */
   private Document document;
 
-  /**
-   * The index of the next {@link #document} within the {@link #hits}.
-   */
+  /** The index of the next {@link #document} within the {@link #hits}. */
   private int nextDocumentIndex = 0;
 
-  /**
-   * Session used to localize Lucene text into string pool nodes.
-   */
-  private ResolverSession session;
+  /** Session used to localize Lucene text into string pool nodes. */
+  private final ResolverSession session;
 
-  /**
-   * The number of items in to tuples
-   */
+  /** The number of items in tuples */
   private long rowCount = -1;
 
+  /** The upper bound on the number of items in tuples */
+  private long rowUpperBound = -1;
+
   private final List<Variable> variableList = new ArrayList<Variable>(3);
   private final List<String> luceneKeyList = new ArrayList<String>(3);
 
-  private LuceneConstraint constraint;
+  private final FullTextStringIndex fullTextStringIndex;
+  private final LuceneConstraint constraint;
 
+  private final ConstraintElement subjectElement;
+  private final ConstraintElement predicateElement;
+  private final String            object;
+
   //
   // Constructor
   //
@@ -133,49 +135,31 @@
    */
   FullTextStringIndexTuples(FullTextStringIndex fullTextStringIndex,
       LuceneConstraint constraint, ResolverSession session) throws QueryException {
+    this.fullTextStringIndex = fullTextStringIndex;
     this.session = session;
     this.constraint = constraint;
 
     try {
-      // Validate and globalize subject
-      String subject = null;
-      ConstraintElement subjectElement = constraint.getSubject();
+      // process subject
+      subjectElement = constraint.getSubject();
 
       if (subjectElement instanceof Variable) {
         variableList.add((Variable)subjectElement);
         luceneKeyList.add(FullTextStringIndex.SUBJECT_KEY);
-      } else if (subjectElement instanceof LocalNode) {
-        try {
-          URIReference subjectURI = (URIReference) session.globalize(((
-              LocalNode) subjectElement).getValue());
-          subject = subjectURI.getURI().toString();
-        } catch (ClassCastException ec) {
-          throw new QueryException("Bad subject in Lucene constraint", ec);
-        }
       }
 
-      // Validate and globalize predicate
-      String predicate = null;
-      ConstraintElement predicateElement = constraint.getPredicate();
+      // process predicate
+      predicateElement = constraint.getPredicate();
+
       if (predicateElement instanceof Variable) {
         variableList.add((Variable)predicateElement);
         luceneKeyList.add(FullTextStringIndex.PREDICATE_KEY);
-      } else if (predicateElement instanceof LocalNode) {
-        try {
-          URIReference predicateURI = (URIReference) session.globalize(((
-              LocalNode) predicateElement).getValue());
-          predicate = predicateURI.getURI().toString();
-        } catch (ClassCastException ec) {
-          throw new QueryException("Bad predicate in Lucene constraint", ec);
-        }
       }
 
-      // Validate and globalize object
-      String object;
+      // process object
       ConstraintElement objectElement = constraint.getObject();
       try {
-        LiteralImpl objectLiteral = (LiteralImpl) session.globalize(((LocalNode)
-            objectElement).getValue());
+        LiteralImpl objectLiteral = (LiteralImpl)session.globalize(((LocalNode)objectElement).getValue());
         object = objectLiteral.getLexicalForm();
       } catch (ClassCastException e) {
         throw new QueryException("The object of any rdf:object statement in a mulgara:LuceneModel " +
@@ -188,16 +172,9 @@
         variableList.add(score);
       }
 
-      if (logger.isInfoEnabled()) {
-        logger.info("Searching for " + subject + " : " + predicate + " : " + object);
-      }
-      // Initialize fields
-      hits = fullTextStringIndex.find(subject, predicate, object);
       setVariables(variableList);
     } catch (GlobalizeException e) {
       throw new QueryException("Couldn't globalize constraint elements", e);
-    } catch (FullTextStringIndexException e) {
-      throw new QueryException("Couldn't generate answer from text index", e);
     }
   }
 
@@ -205,15 +182,49 @@
   // Implementation of AbstractTuples methods
   //
 
-  public void beforeFirst(long[] prefix,
-      int suffixTruncation) throws TuplesException {
+  public void beforeFirst(long[] prefix, int suffixTruncation) throws TuplesException {
+    String subject = getString(subjectElement, prefix.length > 0 ? prefix[0] : 0);  // 0 = no bound value supplied
+    String predicate = getString(predicateElement, prefix.length > 1 ? prefix[1] : 0);  // NOTE(review): assumes prefix[1] is the predicate column even when the subject is not a variable - confirm
+
+    if (logger.isDebugEnabled()) {
+      logger.debug("Searching for " + subject + " : " + predicate + " : " + object);
+    }
+
+    try {
+      hits = fullTextStringIndex.find(subject, predicate, object);  // the index search now runs here, on each beforeFirst()
+    } catch (FullTextStringIndexException e) {
+      throw new TuplesException("Couldn't generate answer from text index: subject='" + subject +
+                                "', predicate='" + predicate + "', object='" + object + "'", e);
+    }
+
+    //Tuples tuples = TuplesOperations.sort(tmpTuples);
+
     document = null;
     nextDocumentIndex = 0;
+    rowCount = -1;  // presumably a cached count for the previous hits; verify against getRowCount()
+    rowUpperBound = -1;  // forces getRowUpperBound() to recompute for the new hits
   }
 
+  private String getString(ConstraintElement ce, long boundVal) throws TuplesException {
+    if (ce instanceof LocalNode) boundVal = ((LocalNode)ce).getValue();  // a fixed node in the constraint overrides boundVal
+    // Converts the resulting local node value to the string form handed to the text index.
+    if (boundVal == 0) return null;  // callers pass 0 when they have no value; null is returned for it
+
+    try {
+      Object val =  session.globalize(boundVal);
+      if (val instanceof URIReference) return ((URIReference)val).getURI().toString();
+      if (val instanceof Literal) return ((Literal)val).getLexicalForm();
+      if (val instanceof BlankNode) return "";  // blank nodes map to the empty string
+
+      throw new TuplesException("Unknown node-type for Lucene constraint '" + ce + "': local-value=" + boundVal + ", global-value=" + val + ", class=" + val.getClass());
+    } catch (GlobalizeException e) {
+      throw new TuplesException("Couldn't globalize value " + boundVal, e);
+    }
+  }
+
   public void close() throws TuplesException {
     try {
-      hits.close();
+      if (hits != null) hits.close();
     } catch (IOException ioe) {
       throw new TuplesException("Error closing fulltext index hits", ioe);
     }
@@ -221,7 +232,7 @@
 
   public FullTextStringIndexTuples clone() {
     FullTextStringIndexTuples clone = (FullTextStringIndexTuples) super.clone();
-    clone.hits = hits.clone();
+    if (hits != null) clone.hits = hits.clone();  // hits stays null until beforeFirst() has run a search
     return clone;
   }
 
@@ -239,8 +250,7 @@
     } catch (IOException e) {
       throw new TuplesException("Couldn't get column " + column + " value", e);
     } catch (LocalizeException e) {
-      throw new TuplesException("Couldn't localize column " + column + " value",
-          e);
+      throw new TuplesException("Couldn't localize column " + column + " value", e);
     } catch (URISyntaxException e) {
       throw new TuplesException("Couldn't get column " + column + " value", e);
     }
@@ -255,9 +265,39 @@
   }
 
   public long getRowUpperBound() throws TuplesException {
-    return getRowCount();
+    if (rowUpperBound == -1) {  // -1 marks "not computed yet"; beforeFirst() resets it
+      try {
+        rowUpperBound = (hits != null) ? getRowCount() :  // no search run yet: estimate from the index
+            fullTextStringIndex.getMaxDocs(getString(subjectElement, 0), getString(predicateElement, 0), object);
+      } catch (FullTextStringIndexException e) {
+        throw new TuplesException("Couldn't get row upper-bound from text index: subject='" +
+                                  getString(subjectElement, 0) + "', predicate='" +
+                                  getString(predicateElement, 0) + "', object='" + object + "'", e);
+      }
+    }
+
+    return rowUpperBound;
   }
 
+  public int getRowCardinality() throws TuplesException {
+    long bound = getRowUpperBound();  // classifies by the upper bound, so the result may overstate the actual row count
+
+    if (bound == 0) return Tuples.ZERO;
+    if (bound == 1) return Tuples.ONE;
+    return Tuples.MANY;
+
+    /* Exact, but slower
+    if (getRowUpperBound() == 0) return Tuples.ZERO;
+
+    if (hits == null) beforeFirst();
+
+    long count = getRowCount();
+    if (count == 0) return Tuples.ZERO;
+    if (count == 1) return Tuples.ONE;
+    return Tuples.MANY;
+    */
+  }
+
   /**
    * Lucene never generates unbound columns.
    *
@@ -276,6 +316,8 @@
   }
 
   public boolean next() throws TuplesException {
+    assert hits != null : "next() called without beforeFirst()";
+
     try {
       if (nextDocumentIndex < getRowCount()) {
         document = hits.doc(nextDocumentIndex++);

Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java	2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java	2009-01-21 12:44:47 UTC (rev 1446)
@@ -372,13 +372,7 @@
     // generate the tuples
     try {
       FullTextStringIndex stringIndex = getFullTextStringIndex(((LocalNode)modelElement).getValue());
-      Tuples tmpTuples = new FullTextStringIndexTuples(stringIndex, (LuceneConstraint) constraint, resolverSession);
-      Tuples tuples = TuplesOperations.sort(tmpTuples);
-      tmpTuples.close();
-
-      return new TuplesWrapperResolution(tuples, constraint);
-    } catch (TuplesException te) {
-      throw new QueryException("Failed to sort tuples and close", te);
+      return new FullTextStringIndexTuples(stringIndex, (LuceneConstraint)constraint, resolverSession);
     } catch (IOException ioe) {
       throw new QueryException("Failed to open string index", ioe);
     } catch (FullTextStringIndexException ef) {




More information about the Mulgara-svn mailing list