Some fulltext search would help. I recently stumbled over elasticsearch - a schema-free, scalable search engine based on Apache Lucene. I decided to give it a try - not because of its distributed nature, but for its REST interface.
I did the following 4 steps to get a simple fulltext search working:
1/ Extracted text from the PDFs using pdftotext and a simple bash one-liner.
for FILE in *.pdf; do pdftotext "$FILE"; done
2/ Created a Java Maven project. The elasticsearch pom.xml I found did not contain the necessary dependencies, so I had to add them to my pom.xml myself, and the result is a bit messy.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>docsearch</groupId>
    <artifactId>docsearch</artifactId>
    <version>1.0</version>

    <repositories>
        <repository>
            <id>fuse</id>
            <url>http://repo.fusesource.com/maven2/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>0.16.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-snowball</artifactId>
            <version>3.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-fast-vector-highlighter</artifactId>
            <version>3.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queries</artifactId>
            <version>2.4.0</version>
        </dependency>
    </dependencies>
</project>
3/ Downloaded the elasticsearch release and started it.
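For completeness, the commands looked roughly like this - a sketch, assuming the 0.16.0 tar.gz release (the archive name will differ for other versions):
# unpack the downloaded release and start the server
# (on the 0.x releases, -f keeps elasticsearch running in the foreground)
tar -xzf elasticsearch-0.16.0.tar.gz
cd elasticsearch-0.16.0
./bin/elasticsearch -f
By default it listens on port 9200 for the HTTP/REST interface and on port 9300 for the Java transport client, which is what the code below uses.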
4/ Wrote a simple Java program to iterate over the text files, read them line by line and feed them to the running elasticsearch service:
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

import java.io.*;

import static org.elasticsearch.common.xcontent.XContentFactory.*;

public class Main {

    final static String dataDirName = "/tmp/doc";

    public static void main(String[] args) {
        File dataDir = new File(dataDirName);
        if (dataDir.exists() && dataDir.isDirectory()) {
            // pick up only the text files produced by pdftotext
            File[] files = dataDir.listFiles(new FilenameFilter() {
                public boolean accept(File dir, String name) {
                    return name.endsWith("txt");
                }
            });

            // connect to the already running elasticsearch instance
            // (no need to start an embedded node for this)
            Client client = new TransportClient()
                    .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));

            String indexName = "docs";
            String docType = "doc";
            String docId = null;

            for (File file : files) {
                try {
                    BufferedReader reader = new BufferedReader(new FileReader(file));
                    String line;
                    StringBuilder fileContent = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        // keep the line breaks, otherwise words at line boundaries get glued together
                        fileContent.append(line).append('\n');
                    }
                    reader.close();

                    // use the file name as the document id and index the whole text as one field
                    docId = file.getName();
                    IndexResponse response = client.prepareIndex(indexName, docType, docId)
                            .setSource(jsonBuilder()
                                    .startObject()
                                    .field("content", fileContent.toString())
                                    .endObject())
                            .execute().actionGet();
                } catch (FileNotFoundException ex) {
                    ex.printStackTrace();
                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
            client.close();
        }
    }
}
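With the documents indexed, a quick sanity check over the REST interface looks something like this (the search term "lucene" is just an example; the index, type and field names match the code above):
# query the "content" field of the indexed documents over HTTP
curl 'http://localhost:9200/docs/doc/_search?q=content:lucene&pretty=true'
The hits come back as JSON and include the _id of each match, which is the original file name, so it is easy to map results back to the PDFs.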