modify indexing and search logic to account for phrases

This commit is contained in:
Zlatin Balevsky
2019-11-05 13:24:22 +00:00
parent 7e881f1fe6
commit 8dcba7535c
2 changed files with 45 additions and 4 deletions

View File

@ -31,25 +31,48 @@ class SearchIndex {
}
}
private static String[] split(String source) {
source = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase()
String [] split = source.split(" ")
/**
 * Breaks a source string into the keywords under which it is indexed.
 *
 * The result contains, in this order:
 *   1. the lower-cased tokens left after every SplitPattern.SPLIT_PATTERN
 *      match has been replaced with a space,
 *   2. the lower-cased tokens obtained by splitting on ' ' only, so tokens
 *      that contain split-pattern characters (e.g. "a-b") stay searchable
 *      as a whole,
 *   3. the original string, verbatim.
 * Empty tokens are dropped; duplicates are not removed.
 *
 * @param source the string to tokenize
 * @return the keywords derived from source
 */
private static String[] split(final String source) {
    def rv = []

    // first split by split pattern
    String sourceSplit = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase()
    sourceSplit.split(" ").each { if (it.length() > 0) rv << it }

    // then just by ' '; lower-cased because search() lower-cases every term
    // before looking it up, so a mixed-case token could never be matched
    source.toLowerCase().split(' ').each { if (it.length() > 0) rv << it }

    // and add original string
    rv << source
    rv.toArray(new String[0])
}
String[] search(List<String> terms) {
Set<String> rv = null;
Set<String> powerSet = new HashSet<>()
terms.each {
powerSet.addAll(it.toLowerCase().split(' '))
}
powerSet.each {
Set<String> forWord = keywords.getOrDefault(it,[])
if (rv == null) {
rv = new HashSet<>(forWord)
} else {
rv.retainAll(forWord)
}
}
// now, filter by terms
for (Iterator<String> iter = rv.iterator(); iter.hasNext();) {
String candidate = iter.next()
candidate = candidate.toLowerCase()
boolean keep = true
terms.each {
keep &= candidate.contains(it)
}
if (!keep)
iter.remove()
}
if (rv != null)

View File

@ -90,4 +90,22 @@ class SearchIndexTest {
def found = index.search(["muwire", "0", "3", "jar"])
assert found.size() == 1
}
@Test
void testOriginalText() {
    // a single indexed entry made of two hyphenated words
    initIndex(["a-b c-d"])

    // each word, searched with its hyphen intact, must hit that entry
    def firstWordHits = index.search(['a-b'])
    assert firstWordHits.size() == 1

    def secondWordHits = index.search(['c-d'])
    assert secondWordHits.size() == 1
}
@Test
void testPhrase() {
    // a single indexed entry made of three hyphenated words
    initIndex(["a-b c-d e-f"])

    // a phrase of two adjacent words matches the entry
    def leadingPhraseHits = index.search(['a-b c-d'])
    assert leadingPhraseHits.size() == 1

    def trailingPhraseHits = index.search(['c-d e-f'])
    assert trailingPhraseHits.size() == 1

    // both words are present but not adjacent, so the phrase must not match
    def brokenPhraseHits = index.search(['a-b e-f'])
    assert brokenPhraseHits.size() == 0
}
}