The following steps describe the logic used to create a simple Lucene index from the Recovery.gov data. This sample project is available on GitHub here: http://github.com/tobrien/sample-parse
1. Download the Raw XML Data from Recovery.gov
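2. Add the following dependencies to a Java application (the classes below also log through log4j, so add a log4j dependency as well if it is not already on your classpath):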
<dependency>
<groupId>commons-digester</groupId>
<artifactId>commons-digester</artifactId>
<version>2.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
3. Place the "grants.xml" from Recovery.gov into a directory named "data/".
4. Write the following class to parse the Recovery.gov grants.xml file and populate a Lucene index with some basic fields:
package com.discursive.sample.parse;
import java.io.File;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
public class ParsingGrants {
private Logger logger = Logger.getLogger(ParsingGrants.class);
/**
 * Command-line entry point: sets up log4j with the basic configuration at
 * INFO level on the root logger, then runs the indexing job.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the indexing job fails
 */
public static void main(String[] args) throws Exception {
    BasicConfigurator.configure();
    Logger rootLogger = Logger.getRootLogger();
    rootLogger.setLevel(Level.INFO);

    ParsingGrants job = new ParsingGrants();
    job.go();
}
/**
 * Creates a fresh Lucene index in the "index" directory and fills it with the
 * grants found in "./data/grants.xml".
 *
 * The writer and the directory are released in finally blocks so that they
 * are closed even when parsing or optimizing fails part-way through.
 *
 * @throws Exception if the index cannot be created or the grants file cannot
 *                   be parsed
 */
public void go() throws Exception {
    logger.info("Creating Index");
    Directory index = new SimpleFSDirectory(new File("index"));
    try {
        // 'true' = create a new index, replacing any existing one.
        IndexWriter writer = new IndexWriter(index, new SimpleAnalyzer(), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            GrantIndexer grantIndexer = new GrantIndexer(writer);
            grantIndexer.init();
            grantIndexer.index(new File("./data/grants.xml"));
            writer.optimize();
        } finally {
            // Always close the writer, even if indexing threw.
            writer.close();
        }
    } finally {
        index.close();
    }
    logger.info("Parsing Complete, Index Created");
}
}
5. This main class references a GrantIndexer, which is responsible for adding documents to a Lucene index:
package com.discursive.sample.parse;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.apache.commons.digester.Digester;
import org.apache.commons.digester.xmlrules.DigesterLoader;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.xml.sax.SAXException;
public class GrantIndexer {
private Logger logger = Logger.getLogger( GrantIndexer.class );
private IndexWriter indexWriter;
private Digester digester;
private DigestContext context;
/**
 * Creates an indexer that adds its documents through the given writer.
 *
 * @param pIndexWriter the Lucene writer that receives one document per grant
 */
public GrantIndexer(IndexWriter pIndexWriter) {
    this.indexWriter = pIndexWriter;
}
/**
 * Builds the Digester from the "grant-digester-rules.xml" resource, which is
 * looked up relative to this class on the classpath.
 *
 * @throws IllegalStateException if the rules resource cannot be found
 */
public void init( ) {
    URL grantRules =
        GrantIndexer.class.getResource("grant-digester-rules.xml");
    if (grantRules == null) {
        // getResource returns null for a missing resource; fail here with a
        // clear message instead of passing null on to the Digester loader.
        throw new IllegalStateException(
            "grant-digester-rules.xml not found on the classpath next to "
                + GrantIndexer.class.getName());
    }
    digester = DigesterLoader.createDigester( grantRules );
}
/**
 * Parses the given grants XML file with the configured Digester. A new
 * DigestContext is stored in this indexer and pushed onto the Digester
 * stack before parsing starts.
 *
 * @param grantsXml the grants XML file to parse
 * @throws IOException  if the file cannot be read
 * @throws SAXException if the XML cannot be parsed
 */
public void index(File grantsXml) throws IOException, SAXException {
    DigestContext freshContext = new DigestContext( );
    this.context = freshContext;
    this.digester.push( freshContext );
    this.digester.parse( grantsXml );
}
/**
 * Builds one Lucene document from the values currently held by the digest
 * context and adds it to the index.
 *
 * Values that are null (the corresponding setter was never called) are
 * skipped, because the Lucene Field constructor rejects null values. Once the
 * document has been built the context fields are cleared: the same context
 * object is reused for every call, so without the reset a later call would
 * pick up whatever value an earlier call left behind.
 *
 * An IOException from the index writer is logged and not rethrown.
 */
public void processEntry( ) {
    Document doc = new Document( );
    addField(doc, "id", context.grantId, Field.Index.NOT_ANALYZED);
    addField(doc, "zip", context.zip, Field.Index.NOT_ANALYZED);
    addField(doc, "funding", context.funding, Field.Index.NOT_ANALYZED);
    addField(doc, "city", context.city, Field.Index.NOT_ANALYZED);
    addField(doc, "state", context.state, Field.Index.NOT_ANALYZED);
    addField(doc, "title", context.title, Field.Index.ANALYZED);
    addField(doc, "desc", context.desc, Field.Index.ANALYZED);

    // Clear the shared context so stale values cannot leak into the next document.
    context.grantId = null;
    context.zip = null;
    context.funding = null;
    context.city = null;
    context.state = null;
    context.title = null;
    context.desc = null;

    try {
        indexWriter.addDocument( doc );
    } catch( IOException ioe ) {
        logger.error( "Unable to add document to index", ioe);
    }
}

/** Adds a stored field to the document unless the value is null. */
private void addField(Document doc, String name, String value, Field.Index indexMode) {
    if (value != null) {
        doc.add(new Field(name, value, Field.Store.YES, indexMode));
    }
}
/**
 * Mutable holder for the values of a single grant. It is a non-static inner
 * class because grantEnd() delegates to the enclosing indexer's
 * processEntry().
 */
public class DigestContext {
    String grantId;
    String zip;
    String funding;
    String state;
    String city;
    String title;
    String desc;

    public void setGrantId(String grantId) {
        this.grantId = grantId;
    }

    public void setZip(String zip) {
        this.zip = zip;
    }

    public void setFunding(String funding) {
        this.funding = funding;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public void setState(String state) {
        this.state = state;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public void setDesc(String desc) {
        this.desc = desc;
    }

    /** Hands the collected values to the enclosing indexer. */
    public void grantEnd( ) {
        GrantIndexer.this.processEntry( );
    }
}
}
6. Lastly, these are the simple Digester rules that populate the DigestContext bean defined in GrantIndexer:
<?xml version="1.0"?>
<digester-rules>
<pattern value="feed">
<pattern value="entry">
<bean-property-setter-rule pattern="id"
propertyname="grantId"/>
<bean-property-setter-rule pattern="content/recipient_zip"
propertyname="zip"/>
<bean-property-setter-rule pattern="content/recipient_city_name"
propertyname="city"/>
<bean-property-setter-rule pattern="content/recipient_state_code"
propertyname="state"/>
<bean-property-setter-rule pattern="content/fed_funding_amount"
propertyname="funding"/>
<bean-property-setter-rule pattern="content/cfda_program_title"
propertyname="title"/>
<bean-property-setter-rule pattern="content/proj_desc"
propertyname="desc"/>
<call-method-rule methodname="grantEnd"
paramtype="java.lang.Object"/>
</pattern>
</pattern>
</digester-rules>
7. Running ParsingGrants should take approximately 20-30 seconds, after which you will have a Lucene index in a directory named "index/".
Help










