001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.cluster; 022 023import org.biojava.nbio.structure.*; 024import org.slf4j.Logger; 025import org.slf4j.LoggerFactory; 026 027import java.util.ArrayList; 028import java.util.Collections; 029import java.util.List; 030 031/** 032 * The SubunitExtractor extracts the information of each protein {@link Chain} 033 * in a {@link Structure} and converts them into a List of {@link Subunit}. 034 * 035 * @author Peter Rose 036 * @author Aleix Lafita 037 * @since 5.0.0 038 * 039 */ 040public class SubunitExtractor { 041 042 private static final Logger logger = LoggerFactory 043 .getLogger(SubunitExtractor.class); 044 045 /** Prevent instantiation **/ 046 private SubunitExtractor() { 047 } 048 049 /** 050 * Extract the information of each protein Chain in a Structure and converts 051 * them into a List of Subunit. The name of the Subunits is set to 052 * {@link Chain#getId()}. 053 * 054 * 055 * @param structure 056 * Structure object with protein Chains 057 * @param absMinLen 058 * {@link SubunitClustererParameters#getAbsoluteMinimumSequenceLength()} 059 * @param fraction 060 * {@link SubunitClustererParameters#getMinimumSequenceLengthFraction()} 061 * @param minLen 062 * {@link SubunitClustererParameters#getMinimumSequenceLength()} 063 * @return List of Subunits 064 */ 065 public static List<Subunit> extractSubunits(Structure structure, 066 int absMinLen, double fraction, int minLen) { 067 068 // The extracted subunit container 069 List<Subunit> subunits = new ArrayList<>(); 070 071 for (Chain c : structure.getPolyChains()) { 072 // Only take protein chains 073 if (c.isProtein()) { 074 Atom[] ca = StructureTools.getRepresentativeAtomArray(c); 075 logger.debug("Chain " + c.getId() + "; CA Atoms: " + ca.length + "; SEQRES: " + c.getSeqResSequence()); 076 if (ca.length==0) 077 continue; 078 subunits.add(new Subunit(ca, c.getId(), null, structure)); 079 } 080 } 081 082 // Calculate the minimum length of a Subunit 083 int adjustedMinLen = calcAdjustedMinimumSequenceLength(subunits, 084 absMinLen, fraction, minLen); 085 logger.debug("Adjusted minimum sequence length: {}", adjustedMinLen); 086 087 // Filter out short Subunits 088 for (int s = subunits.size() - 1; s >= 0; s--) { 089 if (subunits.get(s).size() < adjustedMinLen) 090 subunits.remove(s); 091 } 092 093 return subunits; 094 } 095 096 /** 097 * Returns an adapted minimum sequence length. This method ensure that 098 * structure that only have short chains are not excluded by the 099 * minimumSequenceLength cutoff value. 100 * 101 * @return adjustedMinimumSequenceLength 102 */ 103 private static int calcAdjustedMinimumSequenceLength( 104 List<Subunit> subunits, int absMinLen, double fraction, int minLen) { 105 106 int maxLength = Integer.MIN_VALUE; 107 int minLength = Integer.MAX_VALUE; 108 109 // Extract the length List, the min and the max 110 List<Integer> lengths = new ArrayList<>(); 111 for (int i = 0; i < subunits.size(); i++) { 112 if (subunits.get(i).size() >= absMinLen) { 113 maxLength = Math.max(subunits.get(i).size(), maxLength); 114 minLength = Math.min(subunits.get(i).size(), minLength); 115 lengths.add(subunits.get(i).size()); 116 117 } 118 } 119 120 int adjustedMinimumSequenceLength = minLen; 121 122 if (lengths.size() < 2) 123 return adjustedMinimumSequenceLength; 124 125 // Calculate the median of the lengths 126 double median = 0; 127 Collections.sort(lengths); 128 if (lengths.size() % 2 == 1) { 129 int middle = (lengths.size() - 1) / 2; 130 median = lengths.get(middle); 131 } else { 132 int middle2 = lengths.size() / 2; 133 int middle1 = middle2 - 1; 134 median = 0.5 * (lengths.get(middle1) + lengths.get(middle2)); 135 } 136 137 // If the median * fraction is lower than the minLength 138 if (minLength >= median * fraction) { 139 adjustedMinimumSequenceLength = Math.min(minLength, minLen); 140 } 141 142 return adjustedMinimumSequenceLength; 143 } 144}