001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.io.embl; 022 023 024import java.io.*; 025import java.util.Arrays; 026import java.util.LinkedList; 027 028 029/** 030 * This class should process the data of embl file 031 * 032 * @author Noor Aldeen Al Mbaidin 033 * @since 5.0.0 034 */ 035public class EmblReader { 036 037 /** 038 * The parsing is done in this method.<br> 039 * This method tries to process all the Embl records 040 * in the File , closes the underlying resource, 041 * and return the results in object of EmblRecord.<br> 042 * 043 * @return EmblRecord containing all the parsed Embl records 044 * @throws IOException 045 */ 046 public static EmblRecord process(File file) throws IOException { 047 048 EmblRecord emblRecord = new EmblRecord(); 049 StringBuilder sequence = new StringBuilder(""); 050 LinkedList<EmblReference> emblReferences = new LinkedList<>(); 051 EmblReference emblReference = new EmblReference(); 052 LinkedList<String> accessionNumber = new LinkedList<>(); 053 LinkedList<String> keyword = new LinkedList<>(); 054 055 if (file == null) 056 throw new NullPointerException("file can't be null"); 057 058 if (file.isDirectory()) 059 throw new IllegalArgumentException("the file can't be a directory"); 060 061 try (FileReader fileReader = new FileReader(file)) { 062 String line = ""; 063 String lineIdentifier; 064 String lineInfo; 065 try (BufferedReader bufferedReader = new BufferedReader(fileReader)) { 066 while ((line = bufferedReader.readLine()) != null) { 067 if (line.length() > 1) { 068 lineInfo = line.substring(2, line.length()).trim(); 069 lineIdentifier = line.substring(0, 2); 070 if (lineIdentifier.equals("ID")) 071 emblRecord.setEmblId(populateID(lineInfo)); 072 else if (lineIdentifier.equals("AC")) 073 populateAccessionNumber(line, accessionNumber); 074 else if (lineIdentifier.equals("DT") && line.contains("Created")) 075 emblRecord.setCreatedDate(lineInfo); 076 else if (lineIdentifier.equals("DT") && line.contains("updated")) 077 emblRecord.setLastUpdatedDate(lineInfo); 078 else if (lineIdentifier.equals("DE")) 079 emblRecord.setSequenceDescription(lineInfo); 080 else if (lineIdentifier.equals("KW")) 081 keyword.add(lineInfo); 082 else if (lineIdentifier.equals("OS")) 083 emblRecord.setOrganismSpecies(lineInfo); 084 else if (lineIdentifier.equals("OC")) 085 emblRecord.setOrganismClassification(lineInfo); 086 else if (lineIdentifier.equals("OG")) 087 emblRecord.setOrGanelle(lineInfo); 088 else if (lineIdentifier.equals("RN") || lineIdentifier.equals("RP") 089 || lineIdentifier.equals("RX") || lineIdentifier.equals("RG") 090 || lineIdentifier.equals("RA") || lineIdentifier.equals("RT") 091 || lineIdentifier.equals("RL")) 092 populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences); 093 else if (lineIdentifier.equals("DR")) 094 emblRecord.setDatabaseCrossReference(lineInfo); 095 else if (lineIdentifier.equals("AH")) 096 emblRecord.setAssemblyHeader(lineInfo); 097 else if (lineIdentifier.equals("AS")) 098 emblRecord.setAssemblyInformation(lineInfo); 099 else if (lineIdentifier.equals("CO")) 100 emblRecord.setConstructedSequence(lineInfo); 101 else if (lineIdentifier.equals("FH")) 102 emblRecord.setFeatureHeader(lineInfo); 103 else if (lineIdentifier.equals("FT")) 104 emblRecord.setFeatureTable(lineInfo); 105 else if (lineIdentifier.equals("SQ")) 106 emblRecord.setSequenceHeader(lineInfo); 107 else if (lineIdentifier.equals(" ") && !lineIdentifier.equals("//")) 108 populateSequence(line, sequence); 109 else if (lineIdentifier.equals("//")) { 110 emblRecord.setKeyword(keyword); 111 emblRecord.setEmblReference(emblReferences); 112 emblRecord.setAccessionNumber(accessionNumber); 113 emblRecord.setSequence(sequence.toString()); 114 } 115 116 } 117 } 118 } 119 } 120 121 return emblRecord; 122 } 123 124 private static void populateSequence(String line, StringBuilder sequence) { 125 String sequenceLine = line.replace(" ", ""). 126 replaceAll("[0-9]", ""); 127 sequence.append(sequenceLine); 128 } 129 130 private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference 131 , LinkedList<EmblReference> emblReferences) { 132 if (lineIdentifier.equals("RN")) 133 emblReference.setReferenceNumber(lineInfo); 134 else if (lineIdentifier.equals("RP")) 135 emblReference.setReferencePosition(lineInfo); 136 else if (lineIdentifier.equals("RX")) 137 emblReference.setReferenceCrossReference(lineInfo); 138 else if (lineIdentifier.equals("RG")) 139 emblReference.setReferenceGroup(lineInfo); 140 else if (lineIdentifier.equals("RA")) 141 emblReference.setReferenceAuthor(lineInfo); 142 else if (lineIdentifier.equals("RT")) 143 emblReference.setReferenceTitle(lineInfo); 144 else if (lineIdentifier.equals("RL")) { 145 emblReference.setReferenceLocation(lineInfo); 146 emblReferences.add(emblReference.copyEmblReference(emblReference)); 147 } 148 } 149 150 private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) { 151 accessionNumber.add(line); 152 } 153 154 private static EmblId populateID(String line) { 155 String[] strings = line.split(";"); 156 Arrays.stream(strings).map(String::trim).toArray(unused -> strings); 157 EmblId emblId = new EmblId(strings[0], strings[1], strings[2] 158 , strings[3], strings[4], strings[5], strings[6]); 159 return emblId; 160 } 161 162 163}