Jump to content

Parse the Recovery.gov Grant Data and Create a Lucene Index

VOTE
+ 1
  • -
  • +
  tmo9d's Photo
Posted Sep 29 2009 08:45 AM

Maybe you are interested in running some frequency analysis, or creating a full-text search database for the Recovery.gov data? If so, you'll need to download the raw XML database from the Recovery.gov site, and write some code to parse it. I've created a stub project that uses Apache Commons Digester to create an Apache Lucene index.

The following steps describe the logic used to create a simple Lucene index from the Recovery.gov data. This sample project is available on GitHub here: http://github.com/tobrien/sample-parse

1. Download the Raw XML Data from Recovery.gov

2. Add the following dependencies to a Java application:

<!-- Commons Digester: rule-driven XML parsing used by GrantIndexer. -->
<dependency>
    <groupId>commons-digester</groupId>
    <artifactId>commons-digester</artifactId>
    <version>2.0</version>
</dependency>
<!-- Lucene core: IndexWriter, Document and Field used to build the index. -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>2.9.0</version>
</dependency>
<dependency>
    <groupId>commons-collections</groupId>
    <artifactId>commons-collections</artifactId>
    <version>3.2.1</version>
</dependency>
<!-- Log4j: both ParsingGrants and GrantIndexer import org.apache.log4j
     directly, so it has to be declared explicitly for the code to compile. -->
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.14</version>
</dependency>


3. Place the "grants.xml" from Recovery.gov into a directory named "data/".

4. Write the following class to parse the Recovery.gov grants.xml file and populate a Lucene index with some basic fields:

package com.discursive.sample.parse;

import java.io.File;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;

/**
 * Command-line entry point that builds a Lucene index in the "index"
 * directory from the grant records in "./data/grants.xml".
 */
public class ParsingGrants {

	private static final Logger logger = Logger.getLogger(ParsingGrants.class);

	/**
	 * Configures basic console logging at INFO level and runs the indexer.
	 *
	 * @param args ignored
	 * @throws Exception if the index cannot be created or the XML cannot be parsed
	 */
	public static void main(String[] args) throws Exception {
		BasicConfigurator.configure();
		Logger.getRootLogger().setLevel(Level.INFO);
		new ParsingGrants().go();
	}

	/**
	 * Creates the index, feeds every grant entry into it through a
	 * {@link GrantIndexer}, then optimizes the index.
	 *
	 * <p>The writer and the directory are closed even when parsing or
	 * indexing fails, so a failed run does not leave the index write lock
	 * held.</p>
	 *
	 * @throws Exception if the index cannot be opened or written, or the
	 *                   grants file cannot be read or parsed
	 */
	public void go() throws Exception {
		logger.info("Creating Index");
		Directory index = new SimpleFSDirectory(new File("index"));
		try {
			// "true" asks Lucene to create a new index rather than append to one.
			IndexWriter writer = new IndexWriter(index, new SimpleAnalyzer(), true,
					IndexWriter.MaxFieldLength.UNLIMITED);
			try {
				GrantIndexer grantIndexer = new GrantIndexer(writer);
				grantIndexer.init();
				grantIndexer.index(new File("./data/grants.xml"));
				writer.optimize();
			} finally {
				// Always release the writer; it does not close the directory itself.
				writer.close();
			}
		} finally {
			index.close();
		}
		logger.info("Parsing Complete, Index Created");
	}

}


5. This main class references a GrantIndexer, which is responsible for adding documents to a Lucene index:

package com.discursive.sample.parse;

import java.io.File;
import java.io.IOException;
import java.net.URL;

import org.apache.commons.digester.Digester;
import org.apache.commons.digester.xmlrules.DigesterLoader;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.xml.sax.SAXException;

/**
 * Parses a grants XML file with Commons Digester and adds one Lucene
 * {@link Document} per grant entry to the supplied {@link IndexWriter}.
 *
 * <p>Usage: construct, call {@link #init()} once, then call
 * {@link #index(File)}. The caller owns the writer and is responsible for
 * closing it.</p>
 */
public class GrantIndexer {

    private static final Logger logger = Logger.getLogger( GrantIndexer.class );

    /** Classpath resource (relative to this class) holding the Digester rules. */
    private static final String RULES_RESOURCE = "grant-digester-rules.xml";

    private final IndexWriter indexWriter;
    private Digester digester;
    private DigestContext context;

    /**
     * @param pIndexWriter writer that receives one document per grant entry
     */
    public GrantIndexer(IndexWriter pIndexWriter) {
        indexWriter = pIndexWriter;
    }

    /**
     * Builds the Digester from the rules file found next to this class.
     *
     * @throws IllegalStateException if the rules resource is not on the classpath
     */
    public void init( ) {
        URL grantRules = GrantIndexer.class.getResource( RULES_RESOURCE );
        if( grantRules == null ) {
            // Fail with a clear message instead of an opaque error inside the loader.
            throw new IllegalStateException(
                "Digester rules not found on classpath: " + RULES_RESOURCE );
        }
        digester = DigesterLoader.createDigester( grantRules );
    }

    /**
     * Parses the given grants file; each parsed entry is added to the index
     * through {@link #processEntry()}.
     *
     * @param grantsXml the grants XML file to parse
     * @throws IOException if the file cannot be read
     * @throws SAXException if the file is not well-formed XML
     * @throws IllegalStateException if {@link #init()} has not been called
     */
    public void index(File grantsXml) throws IOException, SAXException {
        if( digester == null ) {
            throw new IllegalStateException( "init() must be called before index()" );
        }
        context = new DigestContext( );
        // The rules set properties on, and call grantEnd() on, the top stack object.
        digester.push( context );
        digester.parse( grantsXml );
    }

    /**
     * Builds a document from the values currently held by the context and
     * adds it to the index. Properties that were not present in the entry
     * are left out of the document. A failure to write the document is
     * logged and does not stop the parse.
     */
    public void processEntry( ) {
        Document doc = new Document( );
        addField( doc, "id", context.grantId, Field.Index.NOT_ANALYZED );
        addField( doc, "zip", context.zip, Field.Index.NOT_ANALYZED );
        addField( doc, "funding", context.funding, Field.Index.NOT_ANALYZED );
        addField( doc, "city", context.city, Field.Index.NOT_ANALYZED );
        addField( doc, "state", context.state, Field.Index.NOT_ANALYZED );
        addField( doc, "title", context.title, Field.Index.ANALYZED );
        addField( doc, "desc", context.desc, Field.Index.ANALYZED );

        try {
            indexWriter.addDocument( doc );
        } catch( IOException ioe ) {
            logger.error( "Unable to add document to index", ioe);
        }
    }

    /**
     * Adds a stored field to the document, skipping values that are null
     * because the Field constructor rejects a null value.
     */
    private static void addField(Document doc, String name, String value,
                                 Field.Index indexMode) {
        if( value == null ) {
            return;
        }
        doc.add( new Field( name, value, Field.Store.YES, indexMode ) );
    }

    /**
     * Bean populated by the Digester rules while one grant entry is parsed.
     * It stays a public, non-static inner class: the setters are invoked by
     * the Digester rules, and grantEnd() needs the enclosing indexer.
     */
    public class DigestContext {
        String grantId, zip, funding, state, city, title, desc;
        public void setGrantId(String grantId) { this.grantId = grantId; }
        public void setZip(String zip) { this.zip = zip; }
        public void setFunding(String funding) { this.funding = funding; }
        public void setCity(String city) { this.city = city; }
        public void setState(String state) { this.state = state; }
        public void setTitle(String title) { this.title = title; }
        public void setDesc(String desc) { this.desc = desc; }

        /**
         * Invoked by the rules for each entry: indexes the collected values,
         * then clears them so that an element missing from the next entry
         * cannot inherit this entry's value.
         */
        public void grantEnd( ) {
            try {
                processEntry( );
            } finally {
                reset( );
            }
        }

        /** Clears every collected value. */
        private void reset( ) {
            grantId = null;
            zip = null;
            funding = null;
            state = null;
            city = null;
            title = null;
            desc = null;
        }
    }
}


6. Lastly, these are the simple digester rules which populate the DigestContext bean defined in GrantIndexer:

<?xml version="1.0"?>
<!--
  Digester rules for the grants feed, loaded by GrantIndexer.init().
  They apply to each feed/entry element and populate the DigestContext
  bean that GrantIndexer pushes onto the Digester stack.
-->
<digester-rules>
    <pattern value="feed">
    <pattern value="entry">
        <!-- Each rule maps one child element of the entry to a bean property
             (setGrantId, setZip, setCity, setState, setFunding, setTitle,
             setDesc on DigestContext). -->
        <bean-property-setter-rule pattern="id"
                                   propertyname="grantId"/>
        <bean-property-setter-rule pattern="content/recipient_zip"
                                   propertyname="zip"/>
        <bean-property-setter-rule pattern="content/recipient_city_name"
                                   propertyname="city"/>
        <bean-property-setter-rule pattern="content/recipient_state_code"
                                   propertyname="state"/>
        <bean-property-setter-rule pattern="content/fed_funding_amount"
                                   propertyname="funding"/>
        <bean-property-setter-rule pattern="content/cfda_program_title"
                                   propertyname="title"/>
        <bean-property-setter-rule pattern="content/proj_desc"
                                   propertyname="desc"/>
        <!-- Calls DigestContext.grantEnd(), which hands the collected values
             to GrantIndexer.processEntry().
             NOTE(review): grantEnd() takes no arguments, and the xmlrules DTD
             appears to name this attribute "paramtypes" rather than
             "paramtype"; confirm whether the attribute below has any effect. -->
        <call-method-rule methodname="grantEnd"	
                                   paramtype="java.lang.Object"/>
    </pattern>
    </pattern>
</digester-rules>


7. Running the ParsingGrants class should take approximately 20-30 seconds, after which you will have a Lucene index in a directory named "index/".

Cover of Hadoop: The Definitive Guide
Learn more about this topic from Hadoop: The Definitive Guide.  Apache Hadoop is ideal for organizations with a growing need to process massive application datasets. Hadoop: The Definitive Guide is a comprehensive resource for using Hadoop to build reliable, scalable, distributed systems. Programmers will find details for analyzing large datasets with Hadoop, and administrators will learn how to set up and run Hadoop clusters. The book includes case studies that illustrate how Hadoop is used to solve specific problems.
Learn More Read Now on Safari







0 Alternative Solutions | 0 Comments

filter by: