001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.utils;
023import java.io.BufferedReader;
024import java.io.IOException;
025import java.io.StringReader;
026import java.util.regex.Pattern;
027
028import javax.xml.parsers.ParserConfigurationException;
029import javax.xml.parsers.SAXParser;
030import javax.xml.parsers.SAXParserFactory;
031
032import org.xml.sax.InputSource;
033import org.xml.sax.SAXException;
034import org.xml.sax.helpers.DefaultHandler;
035
036/**
037 * Utility class for reading chunks of XML files and feeding them to SAX.
038 * @author Richard Holland
039 * @since 1.5
040 */
public class XMLTools {

    /**
     * Read-ahead limit, in characters, handed to {@link BufferedReader#mark(int)}
     * while peeking for the start of the next chunk.
     */
    private static final int LOOKAHEAD_LIMIT = 10000;

    // Static methods only, so this class should never be instantiated.
    private XMLTools() {}

    /**
     * Reads the next chunk of an XML document from the reader and feeds it to a SAX parser.
     * <p>
     * Lines are read (and trimmed) until one containing the opening tag of
     * {@code chunkToken} is found; that line and every following line, up to and
     * including the line holding the matching closing tag, are buffered and then
     * parsed as a stand-alone document. Only one chunk is held in memory at a time,
     * so each chunk must be small enough to fit into available memory.
     * <p>
     * After the chunk has been buffered the reader is marked, scanned forward for
     * another opening tag, and reset to the mark, so that the next call starts
     * directly after the chunk just read. The mark is requested with a read-ahead
     * limit of 10000 characters; if the scan has to go further than that before it
     * finds another opening tag or the end of the input, resetting the reader may
     * fail with an {@code IOException}.
     * <p>
     * Known limitations: self-closing {@code <chunkToken/>} elements are not
     * handled, nor are nested elements of the same name, nor more than one
     * opening/closing tag of the chunk element on a single line, nor whitespace
     * between the angle bracket (or slash) and the tag name.
     *
     * @param reader the reader to read the XML from
     * @param m_handler the SAX handler that receives the events of the parsed chunk
     * @param chunkToken the element name delimiting a chunk. It is matched literally
     * and as a complete tag name: {@code seq} matches {@code <seq>} and
     * {@code <seq id="1">} but not {@code <sequence>}.
     * @return true if the opening tag of another chunk was found after this one, false if not.
     * @throws ParserConfigurationException if there was a problem setting up the SAX parser.
     * @throws SAXException if there was a problem parsing the XML, or the input ended
     * before a complete chunk was read.
     * @throws IOException if there was a problem reading the XML from the reader.
     * @throws NullPointerException if {@code chunkToken} is null.
     */
    public static boolean readXMLChunk(BufferedReader reader, DefaultHandler m_handler, String chunkToken) throws ParserConfigurationException, SAXException, IOException {
        if (chunkToken == null) {
            throw new NullPointerException("chunkToken must not be null");
        }

        // Quote the token: characters such as '.' are legal in XML names but would
        // otherwise be interpreted as regular expression syntax.
        String tagName = Pattern.quote(chunkToken);
        // The name in the opening tag must end at whitespace, '/', '>' or the end of
        // the line, so that a longer tag name sharing the same prefix is not mistaken
        // for the start of a chunk.
        Pattern start = Pattern.compile("<" + tagName + "(?:[\\s/>]|$)");
        // XML permits whitespace between the name and '>' in a closing tag.
        Pattern end = Pattern.compile("</" + tagName + "\\s*>");

        // Buffer everything from the line with the opening tag to the line with
        // the closing tag, inclusive.
        StringBuilder buffer = new StringBuilder();
        boolean begunChunk = false;
        boolean filledBuffer = false;
        String line;
        while (!filledBuffer && (line = reader.readLine()) != null) {
            line = line.trim();
            if (!begunChunk) {
                if (!start.matcher(line).find()) {
                    continue; // still in front of the first chunk
                }
                begunChunk = true;
            }
            buffer.append(line).append('\n');
            filledBuffer = end.matcher(line).find();
        }
        if (!filledBuffer) {
            throw new SAXException("Unexpectedly reached end of file");
        }

        // Peek ahead for another opening tag, then rewind so the next call
        // continues right after the chunk that was just buffered.
        reader.mark(LOOKAHEAD_LIMIT);
        boolean hasAnotherChunk = false;
        while (!hasAnotherChunk && (line = reader.readLine()) != null) {
            hasAnotherChunk = start.matcher(line).find();
        }
        reader.reset();

        // Parse the buffered chunk as a document of its own.
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(true);
        SAXParser parser = factory.newSAXParser();
        parser.parse(new InputSource(new StringReader(buffer.toString())), m_handler);

        return hasAnotherChunk;
    }
}