001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.utils; 023import java.io.BufferedReader; 024import java.io.IOException; 025import java.io.StringReader; 026import java.util.regex.Pattern; 027 028import javax.xml.parsers.ParserConfigurationException; 029import javax.xml.parsers.SAXParser; 030import javax.xml.parsers.SAXParserFactory; 031 032import org.xml.sax.InputSource; 033import org.xml.sax.SAXException; 034import org.xml.sax.helpers.DefaultHandler; 035 036/** 037 * Utility class for reading chunks of XML files and feeding them to SAX. 038 * @author Richard Holland 039 * @since 1.5 040 */ 041public class XMLTools { 042 043 // Static methods so should never be instantiated. 044 private XMLTools() {} 045 046 /** 047 * Attempts to read XML file in chunks, passing each chunk to a SAX parser. 048 * As each chunk is read into memory in a buffer, you need to ensure that each chunk 049 * is small enough to fit into available memory. Only one chunk is held in memory 050 * at any one time, and then only long enough for it to be parsed. 051 * When checking for the presence of further chunks, it'll only read up to 1000 chars 052 * further into the file, after which results will be unpredictable. 053 * @param reader the reader to read the XML from 054 * @param m_handler the SAX parser to feed the XML to 055 * @param chunkToken the token to read. The parser will locate the first instance of 056 * <chunkToken and will buffer all content, including the opening tag and up to 057 * and including the closing </chunkToken> tag. It will not currently handle 058 * <chunkToken/> instances, nor instances where more than one tag appears per line, 059 * or extra spaces appear between the angle brackets, slashes, and tag name of the 060 * tag we are searching for. 061 * @return true if there is another chunk left to read after this one, false if not. 062 * @throws ParserConfigurationException if there was a problem setting up the SAX parser. 063 * @throws SAXException if there was a problem parsing the XML. 064 * @throws IOException if there was a problem reading the XML from the reader. 065 */ 066 public static boolean readXMLChunk(BufferedReader reader, DefaultHandler m_handler, String chunkToken) throws ParserConfigurationException, SAXException, IOException { 067 // read next chunk from <chunkToken> to <chunkToken/> inclusive into buffer 068 // process buffer through XML parser 069 StringBuffer buffer = new StringBuffer(); 070 071 Pattern start = Pattern.compile(".*<"+chunkToken+".*"); 072 Pattern end = Pattern.compile(".*</"+chunkToken+">.*"); 073 074 boolean begunChunk = false; 075 boolean filledBuffer = false; 076 String line = null; 077 while (!filledBuffer && (line=reader.readLine())!=null) { 078 line = line.trim(); 079 if (!begunChunk && !start.matcher(line).matches()) continue; 080 else begunChunk = true; 081 buffer.append(line+"\n"); 082 if (end.matcher(line).matches()) filledBuffer = true; 083 } 084 if (!filledBuffer) throw new SAXException("Unexpectedly reached end of file"); 085 reader.mark(10000); 086 boolean hasAnotherChunk = false; 087 while (!hasAnotherChunk && (line=reader.readLine())!=null) { 088 line = line.trim(); 089 if (start.matcher(line).matches()) hasAnotherChunk = true; 090 } 091 reader.reset(); 092 093 SAXParser m_xmlParser; 094 SAXParserFactory factory = SAXParserFactory.newInstance(); 095 factory.setValidating(true); 096 m_xmlParser = factory.newSAXParser(); 097 098 InputSource source = new InputSource(new StringReader(buffer.toString())); 099 m_xmlParser.parse(source, m_handler); 100 101 // return true if there are more in our buffer 102 return hasAnotherChunk; 103 } 104}