001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.io.embl; 022 023 024import java.io.*; 025import java.util.Arrays; 026import java.util.LinkedList; 027 028 029/** 030 * This class should process the data of embl file 031 * 032 * @author Noor Aldeen Al Mbaidin 033 * @since 5.0.0 034 */ 035public class EmblReader { 036 037 /** 038 * The parsing is done in this method.<br> 039 * This method tries to process all the Embl records 040 * in the File , closes the underlying resource, 041 * and return the results in object of EmblRecord.<br> 042 * 043 * @return EmblRecord containing all the parsed Embl records 044 * @throws IOException 045 */ 046 public static EmblRecord process(File file) throws IOException { 047 048 EmblRecord emblRecord = new EmblRecord(); 049 StringBuilder sequence = new StringBuilder(""); 050 LinkedList<EmblReference> emblReferences = new LinkedList<>(); 051 EmblReference emblReference = new EmblReference(); 052 LinkedList<String> accessionNumber = new LinkedList<>(); 053 LinkedList<String> keyword = new LinkedList<>(); 054 055 if (file == null) 056 throw new NullPointerException("file can't be null"); 057 058 if (file.isDirectory()) 059 throw new IllegalArgumentException("the file can't be a directory"); 060 061 try (FileReader fileReader = new FileReader(file)) { 062 String line = ""; 063 String lineIdentifier; 064 String lineInfo; 065 try (BufferedReader bufferedReader = new BufferedReader(fileReader)) { 066 while ((line = bufferedReader.readLine()) != null) { 067 if (line.length() > 1) { 068 lineInfo = line.substring(2, line.length()).trim(); 069 lineIdentifier = line.substring(0, 2); 070 if ("ID".equals(lineIdentifier)) 071 emblRecord.setEmblId(populateID(lineInfo)); 072 else if ("AC".equals(lineIdentifier)) 073 populateAccessionNumber(line, accessionNumber); 074 else if ("DT".equals(lineIdentifier) && line.contains("Created")) 075 emblRecord.setCreatedDate(lineInfo); 076 else if ("DT".equals(lineIdentifier) && line.contains("updated")) 077 emblRecord.setLastUpdatedDate(lineInfo); 078 else if ("DE".equals(lineIdentifier)) 079 emblRecord.setSequenceDescription(lineInfo); 080 else if ("KW".equals(lineIdentifier)) 081 keyword.add(lineInfo); 082 else if ("OS".equals(lineIdentifier)) 083 emblRecord.setOrganismSpecies(lineInfo); 084 else if ("OC".equals(lineIdentifier)) 085 emblRecord.setOrganismClassification(lineInfo); 086 else if ("OG".equals(lineIdentifier)) 087 emblRecord.setOrGanelle(lineInfo); 088 else if ("RN".equals(lineIdentifier) || "RP".equals(lineIdentifier) 089 || "RX".equals(lineIdentifier) || "RG".equals(lineIdentifier) 090 || "RA".equals(lineIdentifier) || "RT".equals(lineIdentifier) 091 || "RL".equals(lineIdentifier)) 092 populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences); 093 else if ("DR".equals(lineIdentifier)) 094 emblRecord.setDatabaseCrossReference(lineInfo); 095 else if ("AH".equals(lineIdentifier)) 096 emblRecord.setAssemblyHeader(lineInfo); 097 else if ("AS".equals(lineIdentifier)) 098 emblRecord.setAssemblyInformation(lineInfo); 099 else if ("CO".equals(lineIdentifier)) 100 emblRecord.setConstructedSequence(lineInfo); 101 else if ("FH".equals(lineIdentifier)) 102 emblRecord.setFeatureHeader(lineInfo); 103 else if ("FT".equals(lineIdentifier)) 104 emblRecord.setFeatureTable(lineInfo); 105 else if ("SQ".equals(lineIdentifier)) 106 emblRecord.setSequenceHeader(lineInfo); 107 else if (" ".equals(lineIdentifier) && !"//".equals(lineIdentifier)) 108 populateSequence(line, sequence); 109 else if ("//".equals(lineIdentifier)) { 110 emblRecord.setKeyword(keyword); 111 emblRecord.setEmblReference(emblReferences); 112 emblRecord.setAccessionNumber(accessionNumber); 113 emblRecord.setSequence(sequence.toString()); 114 } 115 116 } 117 } 118 } 119 } 120 121 return emblRecord; 122 } 123 124 private static void populateSequence(String line, StringBuilder sequence) { 125 String sequenceLine = line.replace(" ", ""). 126 replaceAll("[0-9]", ""); 127 sequence.append(sequenceLine); 128 } 129 130 private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference 131 , LinkedList<EmblReference> emblReferences) { 132 if ("RN".equals(lineIdentifier)) 133 emblReference.setReferenceNumber(lineInfo); 134 else if ("RP".equals(lineIdentifier)) 135 emblReference.setReferencePosition(lineInfo); 136 else if ("RX".equals(lineIdentifier)) 137 emblReference.setReferenceCrossReference(lineInfo); 138 else if ("RG".equals(lineIdentifier)) 139 emblReference.setReferenceGroup(lineInfo); 140 else if ("RA".equals(lineIdentifier)) 141 emblReference.setReferenceAuthor(lineInfo); 142 else if ("RT".equals(lineIdentifier)) 143 emblReference.setReferenceTitle(lineInfo); 144 else if ("RL".equals(lineIdentifier)) { 145 emblReference.setReferenceLocation(lineInfo); 146 emblReferences.add(emblReference.copyEmblReference(emblReference)); 147 } 148 } 149 150 private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) { 151 accessionNumber.add(line); 152 } 153 154 private static EmblId populateID(String line) { 155 String[] strings = line.split(";"); 156 Arrays.stream(strings).map(String::trim).toArray(unused -> strings); 157 EmblId emblId = new EmblId(strings[0], strings[1], strings[2] 158 , strings[3], strings[4], strings[5], strings[6]); 159 return emblId; 160 } 161 162 163}