001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.db.emblcd; 023 024import java.io.IOException; 025import java.io.InputStream; 026 027/** 028 * <p><code>EmblCDROMIndexReader</code> is an abstract class whose 029 * concrete subclasses read EMBL CD-ROM format indices from an 030 * underlying <code>InputStream</code>. This format is used by the 031 * EMBOSS package for database indexing (see programs dbiblast, 032 * dbifasta, dbiflat and dbigcg). Indexing produces four binary files 033 * with a simple format:</p> 034 * 035 * <ul> 036 * <li>division.lkp : master index</li> 037 * <li>entrynam.idx : sequence ID index</li> 038 * <li> acnum.trg : accession number index</li> 039 * <li> acnum.hit : accession number auxiliary index</li> 040 * </ul> 041 * 042 * <p>Internally EMBOSS checks for Big-endian architechtures and 043 * switches the byte order to Little-endian. This means trouble if you 044 * try to read the file using <code>DataInputStream</code>, but at 045 * least the binaries are consistent across architechtures. This class 046 * carries out the necessary conversion.</p> 047 * 048 * <p>The EMBL CD-ROM format stores the date in 4 bytes. One byte is 049 * unused (the first one), leaving one byte for the day, one for the 050 * month and one (!) for the year.</p> 051 * 052 * <p> For further information see the EMBOSS documentation, or for a 053 * full description, the source code of the dbi programs and the Ajax 054 * library.</p> 055 * 056 * @author Keith James 057 * @since 1.2 058 */ 059public abstract class EmblCDROMIndexReader 060{ 061 protected InputStream input; 062 protected StringBuffer sb; 063 protected RecordParser recParser; 064 065 // Header fields 066 private byte [] int4 = new byte [4]; 067 private byte [] int2 = new byte [2]; 068 private byte [] dbName = new byte [20]; 069 private byte [] dbRelease = new byte [10]; 070 private byte [] dbDate = new byte [4]; 071 // Record field 072 private byte [] record; 073 074 private long fileLength; 075 private long recordCount; 076 private int recordLength; 077 private String name; 078 private String release; 079 private String date; 080 081 /** 082 * Creates a new <code>EmblCDROMIndexReader</code> instance. A 083 * <code>BufferedInputStream</code> is probably the most suitable. 084 * 085 * @param input an <code>InputStream</code>. 086 * 087 * @exception IOException if an error occurs. 088 */ 089 public EmblCDROMIndexReader(InputStream input) 090 throws IOException 091 { 092 this.input = input; 093 sb = new StringBuffer(512); 094 recParser = new RecordParser(); 095 096 parseHeader(); 097 } 098 099 /** 100 * <code>readFileLength</code> returns the file length in bytes 101 * (stored within the file's header by the indexing program). This 102 * may be called more than once as the value is cached. 103 * 104 * @return a <code>long</code>. 105 */ 106 public long readFileLength() 107 { 108 return fileLength; 109 } 110 111 /** 112 * <code>readRecordCount</code> returns the number of records in 113 * the file. This may be called more than once as the value is 114 * cached. 115 * 116 * @return a <code>long</code>. 117 */ 118 public long readRecordCount() 119 { 120 return recordCount; 121 } 122 123 /** 124 * <code>readRecordLength</code> returns the record length 125 * (bytes). This may be called more than once as the value is 126 * cached. 127 * 128 * @return an <code>int</code>. 129 */ 130 public int readRecordLength() 131 { 132 return recordLength; 133 } 134 135 /** 136 * <code>readDBName</code> returns the database name from the 137 * index header. This may be called more than once as the value is 138 * cached. 139 * 140 * @return a <code>String</code>. 141 */ 142 public String readDBName() 143 { 144 return name; 145 } 146 147 /** 148 * <code>readDBRelease</code> returns the database release from 149 * the index header. This may be called more than once as the 150 * value is cached. 151 * 152 * @return a <code>String</code>. 153 */ 154 public String readDBRelease() 155 { 156 return release; 157 } 158 159 /** 160 * <code>readDBDate</code> reads the date from the index 161 * header. The date is stored in 4 bytes: 0, unused; 1, year; 2, 162 * month; 3, day. With a 1 byte year it's not very much use and 163 * I'm not sure that the EMBOSS programs set the value correctly 164 * anyway. 165 * 166 * @return a <code>String</code>. 167 */ 168 public String readDBDate() 169 170 { 171 return date; 172 } 173 174 /** 175 * <code>readRecord</code> returns an array of objects parsed from 176 * a single record. Its content will depend on the type of index 177 * file. Concrete subclasses must provide an implementation of 178 * this method. 179 * 180 * @return an <code>Object []</code> array. 181 * 182 * @exception IOException if an error occurs. 183 */ 184 public abstract Object [] readRecord() throws IOException; 185 186 /** 187 * <code>readRawRecord</code> returns the raw bytes of a single 188 * record from the index. 189 * 190 * @return a <code>byte []</code> array. 191 * 192 * @exception IOException if an error occurs. 193 */ 194 public byte [] readRawRecord() throws IOException 195 { 196 int eof = input.read(record); 197 if (eof == -1) 198 input.close(); 199 200 return record; 201 } 202 203 /** 204 * <code>close</code> closes the underlying 205 * <code>InputStream</code>. 206 * 207 * @exception IOException if an error occurs. 208 */ 209 public void close() throws IOException 210 { 211 input.close(); 212 } 213 214 /** 215 * <code>parseHeader</code> carries out a full parse of the 300 216 * byte header (common to all the index types) when first 217 * encountered. 218 * 219 * @exception IOException if an error occurs. 220 */ 221 private void parseHeader() throws IOException 222 { 223 int eof = 0; 224 225 eof = input.read(int4); 226 if (eof == -1) 227 input.close(); 228 229 fileLength = recParser.parseInt4(int4); 230 231 eof = input.read(int4); 232 if (eof == -1) 233 input.close(); 234 235 recordCount = recParser.parseInt4(int4); 236 237 eof = input.read(int2); 238 if (eof == -1) 239 input.close(); 240 241 recordLength = recParser.parseInt2(int2); 242 243 // Set up array for reading records now that we know their 244 // length 245 record = new byte [recordLength]; 246 247 eof = input.read(dbName); 248 if (eof == -1) 249 input.close(); 250 251 sb.setLength(0); 252 name = recParser.parseString(sb, dbName); 253 254 eof = input.read(dbRelease); 255 if (eof == -1) 256 input.close(); 257 258 sb.setLength(0); 259 release = recParser.parseString(sb, dbRelease); 260 261 eof = input.read(dbDate); 262 if (eof == -1) 263 input.close(); 264 265 sb.setLength(0); 266 date = recParser.parseDate(sb, dbDate); 267 268 // Skip the remainder of the header (padding) 269 input.skip(256); 270 } 271}