[Mulgara-svn] r1446 - in trunk: jxdata/iTQL/fulltext_queries src/jar/resolver-lucene/java/org/mulgara/resolver/lucene
ronald at mulgara.org
ronald at mulgara.org
Wed Jan 21 12:44:51 UTC 2009
Author: ronald
Date: 2009-01-21 04:44:47 -0800 (Wed, 21 Jan 2009)
New Revision: 1446
Modified:
trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt
trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt
trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt
trunk/jxdata/iTQL/fulltext_queries/test.jxu
trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java
trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java
trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java
Log:
Changed LuceneResolver to lazy-evaluate the result (FullTextStringIndexTuples).
This allows other parts of the query to produce results first which are then
used to limit the search results.
Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult17.txt 2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><score/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.67</score></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.38</score></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><score/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.50</score></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><score datatype="http://www.w3.org/2001/XMLSchema#double">1.20</score></solution></query></answer>
Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult18.txt 2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.67</sc1></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.38</sc1></solution><solution><pmid resource="urn:pmid:11244589"/><title>[Cholelithiasis in heart transplant patients]</title><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">2.75</sc2></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076292"/><title>[The role of immunohistochemical methods for determining the type of treatment and prognosis of tumoral diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.50</sc1></solution><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.20</sc1></solution><solution><pmid resource="urn:pmid:11244589"/><title>[Cholelithiasis in heart transplant patients]</title><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">2.61</sc2></solution></query></answer>
Modified: trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/queryResult19.txt 2009-01-21 12:44:47 UTC (rev 1446)
@@ -1,2 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
-<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.38</sc1><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">1.04</sc2></solution></query></answer>
+<answer xmlns="http://mulgara.org/tql#"><query><variables><pmid/><title/><sc1/><sc2/></variables><solution><pmid resource="urn:pmid:11076294"/><title>[Ultrastructural phenotypes of tumor cells of endocrine-cellular neoplasms of hepatopancreatoduodenal organs and their role in determining the degree of malignancy and prognosis of these diseases]</title><sc1 datatype="http://www.w3.org/2001/XMLSchema#double">1.20</sc1><sc2 datatype="http://www.w3.org/2001/XMLSchema#double">0.92</sc2></solution></query></answer>
Modified: trunk/jxdata/iTQL/fulltext_queries/test.jxu
===================================================================
--- trunk/jxdata/iTQL/fulltext_queries/test.jxu 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/jxdata/iTQL/fulltext_queries/test.jxu 2009-01-21 12:44:47 UTC (rev 1446)
@@ -314,8 +314,8 @@
order by $pmid $title;"/>
<eval stepClass="org.mulgara.store.jxunit.QueryJX" />
- <subst name="queryResult" regexp="1\.67[0-9]*" value="1.67"/>
- <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
+ <subst name="queryResult" regexp="1\.(50|51)[0-9]*" value="1.50"/>
+ <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
<ifEqual converse="true" file="queryResult17.txt" name="queryResult">
<save name="queryResult" file="badQuery17Result.xml"/>
<fail>Output failed. Check badQuery17Result.xml for output.</fail>
@@ -334,9 +334,9 @@
order by $pmid $title;"/>
<eval stepClass="org.mulgara.store.jxunit.QueryJX" />
- <subst name="queryResult" regexp="1\.67[0-9]*" value="1.67"/>
- <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
- <subst name="queryResult" regexp="2\.75[0-9]*" value="2.75"/>
+ <subst name="queryResult" regexp="1\.(50|51)[0-9]*" value="1.50"/>
+ <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
+ <subst name="queryResult" regexp="2\.(61|62)[0-9]*" value="2.61"/>
<ifEqual converse="true" file="queryResult18.txt" name="queryResult">
<save name="queryResult" file="badQuery18Result.xml"/>
<fail>Output failed. Check badQuery18Result.xml for output.</fail>
@@ -355,8 +355,8 @@
order by $pmid $title;"/>
<eval stepClass="org.mulgara.store.jxunit.QueryJX" />
- <subst name="queryResult" regexp="1\.38[0-9]*" value="1.38"/>
- <subst name="queryResult" regexp="1\.04[0-9]*" value="1.04"/>
+ <subst name="queryResult" regexp="1\.(20|21)[0-9]*" value="1.20"/>
+ <subst name="queryResult" regexp="0\.(92|93)[0-9]*" value="0.92"/>
<ifEqual converse="true" file="queryResult19.txt" name="queryResult">
<save name="queryResult" file="badQuery19Result.xml"/>
<fail>Output failed. Check badQuery19Result.xml for output.</fail>
Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndex.java 2009-01-21 12:44:47 UTC (rev 1446)
@@ -53,6 +53,7 @@
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@@ -561,13 +562,15 @@
" predicate :" + predicate + " literal :" + literal);
}
- if ((subject != null) && (subject.length() > 0)) {
+ if (subject != null) {
TermQuery tSubject = new TermQuery(new Term(SUBJECT_KEY, subject));
+ tSubject.setBoost(0);
bQuery.add(tSubject, BooleanClause.Occur.MUST);
}
- if ((predicate != null) && (predicate.length() > 0)) {
+ if (predicate != null) {
TermQuery tPredicate = new TermQuery(new Term(PREDICATE_KEY, predicate));
+ tPredicate.setBoost(0);
bQuery.add(tPredicate, BooleanClause.Occur.MUST);
}
@@ -652,6 +655,80 @@
}
/**
+ * Returns an upper bound on the number of documents the given query could return.
+ *
+ * The bound is the lowest document frequency found among the subject, the
+ * predicate and the required terms of the literal query; with no constraints
+ * at all it is the size of the index.
+ *
+ * @param subject the subject; may be null
+ * @param predicate the predicate; may be null
+ * @param object literal to be searched via the analyzer; may be null
+ * @return the maximum number of documents
+ * @throws FullTextStringIndexException if an error occurred
+ */
+ public long getMaxDocs(String subject, String predicate, String object)
+ throws FullTextStringIndexException {
+ try {
+ // start from the index size so that every step below only ever lowers the bound
+ long total = indexSearcher.maxDoc();
+
+ if (subject != null) {
+ total = Math.min(indexSearcher.docFreq(new Term(SUBJECT_KEY, subject)), total);
+ if (total == 0) return 0;
+ }
+
+ if (predicate != null) {
+ // the predicate must be looked up by its own value, whether or not a subject was given
+ total = Math.min(indexSearcher.docFreq(new Term(PREDICATE_KEY, predicate)), total);
+ if (total == 0) return 0;
+ }
+
+ if (object != null) {
+ QueryParser parser = new QueryParser(LITERAL_KEY, analyzer);
+ total = findMinDocCount(parser.parse(object), total);
+ }
+
+ return total;
+ } catch (IOException ioe) {
+ closeIndexers = true;
+ throw new FullTextStringIndexException("Unable to count results for query '" + object + "'", ioe);
+ } catch (ParseException pe) {
+ throw new FullTextStringIndexException("Unable to parse query '" + object + "'", pe);
+ }
+ }
+
+ /**
+ * Lowers an upper bound on the number of matching documents using the terms that
+ * every match of the query must contain. Query types other than term, boolean and
+ * phrase queries leave the bound unchanged.
+ *
+ * @param q the parsed query to inspect
+ * @param max the upper bound established so far
+ * @return the smaller of max and the lowest document frequency of any required term
+ * @throws IOException if the index could not be read
+ */
+ private long findMinDocCount(Query q, long max) throws IOException {
+ long count = max;
+
+ if (q instanceof TermQuery) {
+ Term term = ((TermQuery)q).getTerm();
+ count = Math.min(indexSearcher.docFreq(term), count);
+ } else if (q instanceof BooleanQuery) {
+ // only required clauses restrict the result; optional and prohibited ones are skipped
+ for (BooleanClause clause : ((BooleanQuery)q).getClauses()) {
+ if (clause.isRequired()) count = findMinDocCount(clause.getQuery(), count);
+ }
+ } else if (q instanceof PhraseQuery) {
+ for (Term term : ((PhraseQuery)q).getTerms()) {
+ count = Math.min(indexSearcher.docFreq(term), count);
+ }
+ }
+
+ return count;
+ }
+
+ /**
* Acquire the indexers.
*
* @param forWrites whether to acquire an index writer
Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/FullTextStringIndexTuples.java 2009-01-21 12:44:47 UTC (rev 1446)
@@ -40,6 +40,8 @@
import org.apache.lucene.document.Document;
// JRDf
+import org.jrdf.graph.BlankNode;
+import org.jrdf.graph.Literal;
import org.jrdf.graph.URIReference;
// local packages
@@ -82,9 +84,7 @@
/** Logger. */
private final static Logger logger = Logger.getLogger(FullTextStringIndexTuples.class);
- /**
- * The native Lucene query result to represent as a {@link Tuples}.
- */
+ /** The native Lucene query result to represent as a {@link Tuples}. */
private FullTextStringIndex.Hits hits;
/**
@@ -94,26 +94,28 @@
*/
private Document document;
- /**
- * The index of the next {@link #document} within the {@link #hits}.
- */
+ /** The index of the next {@link #document} within the {@link #hits}. */
private int nextDocumentIndex = 0;
- /**
- * Session used to localize Lucene text into string pool nodes.
- */
- private ResolverSession session;
+ /** Session used to localize Lucene text into string pool nodes. */
+ private final ResolverSession session;
- /**
- * The number of items in to tuples
- */
+ /** The number of items in tuples */
private long rowCount = -1;
+ /** The upper bound on the number of items in tuples */
+ private long rowUpperBound = -1;
+
private final List<Variable> variableList = new ArrayList<Variable>(3);
private final List<String> luceneKeyList = new ArrayList<String>(3);
- private LuceneConstraint constraint;
+ private final FullTextStringIndex fullTextStringIndex;
+ private final LuceneConstraint constraint;
+ private final ConstraintElement subjectElement;
+ private final ConstraintElement predicateElement;
+ private final String object;
+
//
// Constructor
//
@@ -133,49 +135,31 @@
*/
FullTextStringIndexTuples(FullTextStringIndex fullTextStringIndex,
LuceneConstraint constraint, ResolverSession session) throws QueryException {
+ this.fullTextStringIndex = fullTextStringIndex;
this.session = session;
this.constraint = constraint;
try {
- // Validate and globalize subject
- String subject = null;
- ConstraintElement subjectElement = constraint.getSubject();
+ // process subject
+ subjectElement = constraint.getSubject();
if (subjectElement instanceof Variable) {
variableList.add((Variable)subjectElement);
luceneKeyList.add(FullTextStringIndex.SUBJECT_KEY);
- } else if (subjectElement instanceof LocalNode) {
- try {
- URIReference subjectURI = (URIReference) session.globalize(((
- LocalNode) subjectElement).getValue());
- subject = subjectURI.getURI().toString();
- } catch (ClassCastException ec) {
- throw new QueryException("Bad subject in Lucene constraint", ec);
- }
}
- // Validate and globalize predicate
- String predicate = null;
- ConstraintElement predicateElement = constraint.getPredicate();
+ // process predicate
+ predicateElement = constraint.getPredicate();
+
if (predicateElement instanceof Variable) {
variableList.add((Variable)predicateElement);
luceneKeyList.add(FullTextStringIndex.PREDICATE_KEY);
- } else if (predicateElement instanceof LocalNode) {
- try {
- URIReference predicateURI = (URIReference) session.globalize(((
- LocalNode) predicateElement).getValue());
- predicate = predicateURI.getURI().toString();
- } catch (ClassCastException ec) {
- throw new QueryException("Bad predicate in Lucene constraint", ec);
- }
}
- // Validate and globalize object
- String object;
+ // process object
ConstraintElement objectElement = constraint.getObject();
try {
- LiteralImpl objectLiteral = (LiteralImpl) session.globalize(((LocalNode)
- objectElement).getValue());
+ LiteralImpl objectLiteral = (LiteralImpl)session.globalize(((LocalNode)objectElement).getValue());
object = objectLiteral.getLexicalForm();
} catch (ClassCastException e) {
throw new QueryException("The object of any rdf:object statement in a mulgara:LuceneModel " +
@@ -188,16 +172,9 @@
variableList.add(score);
}
- if (logger.isInfoEnabled()) {
- logger.info("Searching for " + subject + " : " + predicate + " : " + object);
- }
- // Initialize fields
- hits = fullTextStringIndex.find(subject, predicate, object);
setVariables(variableList);
} catch (GlobalizeException e) {
throw new QueryException("Couldn't globalize constraint elements", e);
- } catch (FullTextStringIndexException e) {
- throw new QueryException("Couldn't generate answer from text index", e);
}
}
@@ -205,15 +182,85 @@
// Implementation of AbstractTuples methods
//
- public void beforeFirst(long[] prefix,
- int suffixTruncation) throws TuplesException {
+ /**
+ * Runs the Lucene search for this constraint, narrowed by any leading column values in prefix.
+ *
+ * @param prefix values for the leading columns; a missing entry or 0 leaves that column unconstrained
+ * @param suffixTruncation not consulted by this implementation
+ * @throws TuplesException if a value cannot be globalized or the text index cannot be searched
+ */
+ public void beforeFirst(long[] prefix, int suffixTruncation) throws TuplesException {
+ // Columns exist only for elements that are variables, so the prefix slot that belongs
+ // to the predicate shifts down by one when the subject is a fixed node.
+ int column = 0;
+ long subjectValue = 0;
+ if (subjectElement instanceof Variable) {
+ if (column < prefix.length) subjectValue = prefix[column];
+ column++;
+ }
+ long predicateValue = 0;
+ if (predicateElement instanceof Variable && column < prefix.length) predicateValue = prefix[column];
+
+ String subject = getString(subjectElement, subjectValue);
+ String predicate = getString(predicateElement, predicateValue);
+
+ if (logger.isDebugEnabled()) {
+ logger.debug("Searching for " + subject + " : " + predicate + " : " + object);
+ }
+
+ // release the hits of an earlier search before they are replaced
+ if (hits != null) {
+ FullTextStringIndex.Hits previous = hits;
+ hits = null;
+ try {
+ previous.close();
+ } catch (IOException ioe) {
+ throw new TuplesException("Error closing fulltext index hits", ioe);
+ }
+ }
+
+ try {
+ hits = fullTextStringIndex.find(subject, predicate, object);
+ } catch (FullTextStringIndexException e) {
+ throw new TuplesException("Couldn't generate answer from text index: subject='" + subject +
+ "', predicate='" + predicate + "', object='" + object + "'", e);
+ }
+
document = null;
nextDocumentIndex = 0;
+ rowCount = -1;
+ rowUpperBound = -1;
}
+ /**
+ * Converts a constraint element into the string the text index is searched with.
+ *
+ * @param ce the constraint element; a fixed node supplies its own value and boundVal is ignored
+ * @param boundVal the local node bound to a variable element, or 0 if it is unbound
+ * @return the URI or lexical form, an empty string for a blank node, or null if unbound
+ * @throws TuplesException if the node cannot be globalized or is of an unsupported type
+ */
+ private String getString(ConstraintElement ce, long boundVal) throws TuplesException {
+ if (ce instanceof LocalNode) boundVal = ((LocalNode)ce).getValue();
+
+ if (boundVal == 0) return null;
+
+ try {
+ Object val = session.globalize(boundVal);
+ if (val instanceof URIReference) return ((URIReference)val).getURI().toString();
+ if (val instanceof Literal) return ((Literal)val).getLexicalForm();
+ if (val instanceof BlankNode) return "";
+
+ throw new TuplesException("Unknown node-type for Lucene constraint '" + ce + "': local-value=" + boundVal +
+ ", global-value=" + val + ", class=" + (val != null ? val.getClass() : null));
+ } catch (GlobalizeException e) {
+ throw new TuplesException("Couldn't globalize value " + boundVal, e);
+ }
+ }
+
public void close() throws TuplesException {
try {
- hits.close();
+ if (hits != null) hits.close();
} catch (IOException ioe) {
throw new TuplesException("Error closing fulltext index hits", ioe);
}
@@ -221,7 +232,7 @@
public FullTextStringIndexTuples clone() {
FullTextStringIndexTuples clone = (FullTextStringIndexTuples) super.clone();
- clone.hits = hits.clone();
+ if (hits != null) clone.hits = hits.clone();
return clone;
}
@@ -239,8 +250,7 @@
} catch (IOException e) {
throw new TuplesException("Couldn't get column " + column + " value", e);
} catch (LocalizeException e) {
- throw new TuplesException("Couldn't localize column " + column + " value",
- e);
+ throw new TuplesException("Couldn't localize column " + column + " value", e);
} catch (URISyntaxException e) {
throw new TuplesException("Couldn't get column " + column + " value", e);
}
@@ -255,9 +265,44 @@
}
public long getRowUpperBound() throws TuplesException {
- return getRowCount();
+ // computed on first use and kept until beforeFirst() resets it to -1
+ if (rowUpperBound == -1) {
+ if (hits != null) {
+ // a search has already run, so the row count of its hits is used
+ rowUpperBound = getRowCount();
+ } else {
+ // no search yet: ask the index for a bound using only the fixed elements
+ String subject = getString(subjectElement, 0);
+ String predicate = getString(predicateElement, 0);
+ try {
+ rowUpperBound = fullTextStringIndex.getMaxDocs(subject, predicate, object);
+ } catch (FullTextStringIndexException e) {
+ throw new TuplesException("Couldn't get row upper-bound from text index: subject='" + subject +
+ "', predicate='" + predicate + "', object='" + object + "'", e);
+ }
+ }
+ }
+
+ return rowUpperBound;
}
+ /**
+ * Classifies the size of this tuples from its row upper bound, without starting a new search.
+ * Because the bound may overestimate, fewer rows may exist than the result suggests.
+ * An exact answer would need the search to be run first (beforeFirst() followed by
+ * getRowCount()), which is slower.
+ *
+ * @return Tuples.ZERO, Tuples.ONE or Tuples.MANY for a bound of 0, 1 or anything else
+ * @throws TuplesException if the upper bound cannot be determined
+ */
+ public int getRowCardinality() throws TuplesException {
+ long bound = getRowUpperBound();
+
+ if (bound == 0) return Tuples.ZERO;
+ if (bound == 1) return Tuples.ONE;
+ return Tuples.MANY;
+ }
+
/**
* Lucene never generates unbound columns.
*
@@ -276,6 +316,8 @@
}
public boolean next() throws TuplesException {
+ assert hits != null : "next() called without beforeFirst()";
+
try {
if (nextDocumentIndex < getRowCount()) {
document = hits.doc(nextDocumentIndex++);
Modified: trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java
===================================================================
--- trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java 2009-01-19 19:31:23 UTC (rev 1445)
+++ trunk/src/jar/resolver-lucene/java/org/mulgara/resolver/lucene/LuceneResolver.java 2009-01-21 12:44:47 UTC (rev 1446)
@@ -372,13 +372,7 @@
// generate the tuples
try {
FullTextStringIndex stringIndex = getFullTextStringIndex(((LocalNode)modelElement).getValue());
- Tuples tmpTuples = new FullTextStringIndexTuples(stringIndex, (LuceneConstraint) constraint, resolverSession);
- Tuples tuples = TuplesOperations.sort(tmpTuples);
- tmpTuples.close();
-
- return new TuplesWrapperResolution(tuples, constraint);
- } catch (TuplesException te) {
- throw new QueryException("Failed to sort tuples and close", te);
+ return new FullTextStringIndexTuples(stringIndex, (LuceneConstraint)constraint, resolverSession);
} catch (IOException ioe) {
throw new QueryException("Failed to open string index", ioe);
} catch (FullTextStringIndexException ef) {
More information about the Mulgara-svn
mailing list