Coursera bioinformaatika kursus

Väga lahe kursus, isegi kui mingil hetkel frustratsioon võimust võtab ja edasi ei taha asjad liikuda, on ikkagi põnev kursus. Sest neil on mass spectromeeter, DNA ja muud lahedad asjad, millest mina midagi ei tea. Molekulaarbioloogia, proteiinid jne.

Mul on mingid koodijupid, mis seal testides läbi läksid. Cheatsheet! Kunagi ei tea, millal on vaja RNA-d tõlkida või üldse veidraid mustreid otsida. Ei läbinud kursust ise täielikult, aga teeks heameelega uuesti. Hea on lugeda seda koodi, sest näiteks Assert fail “wtf” näitab, et päris frustreeritud olin lõpuks. Siis ostsin omale selle raamatu. Duh, langesin ohvriks.

 


List<int> GeneratingTheoreticalSpectrumProblem(string peptide)
{
StringReader str = new StringReader(Resource1.Masses);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while ((t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
List<int> ret = new List<int>();

ret.Add(0);

List<string> l = GenerateAllSubpeptides(peptide);
l.Add(peptide);
for (int i = 0; i < l.Count(); i++)
{
int mass = 0;
for (int j = 0; j < l[i].Length; j++)
{
mass += Int32.Parse(d[l[i][j].ToString()]);
}
ret.Add(mass);
}
ret.Sort();

return ret;
}

void GenSubpeptides(string n, List<string> l)
{
//don't need a list at all actually!
int k = n.Length - 1;
int rep = k - 1;
int hits = 0;
for (; k >= 1; k--)
{
rep = 0;
Node node = ToLinkedList(n);
while (0==0) //while node.idx != rep the second time!
{
if (node.Idx == rep)
{
hits++;
if (hits == 2)
{
hits = 0;
break;
}
}
string sub = "";
for (int i = 0; i < k; i++)
{
sub += node.Text;
node = node.Next;
}
for (int i = 0; i < n.Length - k + 1; i++)
{
node = node.Next;
}

l.Add(sub);
}
}

}
List<string> GenerateAllSubpeptides(string peptide)
{
List<string> ret = new List<string>();
//need duplicates also!
Node n = ToLinkedList(peptide);
GenSubpeptides(peptide, ret);
return ret;
}


public List<string> PeptideEncodingProblem(string DNA, string peptide)
{
//We say that a DNA string Pattern encodes an amino acid
//string Peptide if the RNA string transcribed from either
//Pattern or its reverse complement Pattern translates into Peptide.
//[out] all substrings of text encoding given peptide

int window = peptide.Length * 3;
string rna;
string rna_rev;
List<string> ret = new List<string>();
string subs = "";
for (int i = 0; i < DNA.Length; i++)
{
try
{
subs = DNA.Substring(i, window);
}
catch { break; }
rna = Transcribe(subs);
rna_rev = Transcribe(ReverseComplement(subs));
if (peptide == Translate(rna) || peptide == Translate(rna_rev))
ret.Add(subs);
}
return ret;
}

public string Transcribe(string DNA)
{
StringBuilder rna = new StringBuilder();
for (int i = 0; i < DNA.Length; i++)
{
if (DNA[i] == 'T')
rna.Append('U');
else
rna.Append(DNA[i]);
}
return rna.ToString();
}

public string Translate(string RNA)
{
//translates into amino acid strings
StringReader str = new StringReader(Resource1.Table1);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while( (t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
// value "" means stopcodon
string translated = "";

for (int i = 0; i < RNA.Length; i+=3)
{
string subs;
try
{
subs = RNA.Substring(i, 3);
}
catch (Exception e)
{
break;
}
translated += d[subs];
}
return translated;
}

List<string> FreqWordsWithMismatchesAndRevComplements(string text, int k, int d)
{
//grab all k-mers
List<string> kmers = FindAtLeastFreqKMers(text, k, 1);

int max = 0;
Dictionary<string, int> dict = new Dictionary<string, int>();
foreach (string s in kmers)
{
List<int> pm = ApproxPatternMatch(text, s, d);
List<int> pm2 = ApproxPatternMatch(text, ReverseComplement(s), d);
int count = pm.Count + pm2.Count;
if (count >= max)
{
max = count;
dict.Add(s, max);
}
}
List<string> list = new List<string>();
foreach (KeyValuePair<string, int> kv in dict)
{
if (kv.Value == max)
{
list.Add(kv.Key);
list.Add(ReverseComplement(kv.Key));
}
}
return list;
}

void GetMismatchFills(int id, List<char[]> a, char[] arr)
{
if (id == arr.Length)
{
a.Add((char[])arr.Clone());
return;
}
char[] alph = { 'A', 'C', 'G', 'T' };

for (int i = 0; i < alph.Count(); i++)
{
arr[id] = alph[i];
GetMismatchFills(id+1, a, arr);
}

}

List<string> FrequentWordsWithMismatches(string text, int k, int d)
{
//k -kmer size, d - up to d mismatches in text
//find the most frequent k-mer with up to d mismatches

//grab all k-mers
List<string> kmers = FindAtLeastFreqKMers(text, k, 1);
List<string> kmers_m = new List<string>();//mismatching kmers
int max = 0;
Dictionary<string, int> dict = new Dictionary<string, int>();
kmers_m = kmers;// kmers_m.Distinct().ToList();//
foreach(string s in kmers_m)
{
List<int> pm = ApproxPatternMatch(text, s, d);
if (pm.Count() >= max)
{
max = pm.Count;
dict.Add(s, max);
}
}
List<string> list = new List<string>();
foreach(KeyValuePair<string,int> kv in dict)
{
if (kv.Value == max)
list.Add(kv.Key);
}
return list;
}

List<int[]> GetMismatchPositions(string pattern, int d)
{
//returns mismatch positions, if atmost d mismatch is needed then
//when using the trie it is likely only the d variations are needed
int[] arr = new int[pattern.Length];
List<int[]> lis = new List<int[]>();
for(int i = 0; i < pattern.Length;i++)
{
arr[i] = i;
}
Variations<int> var = new Variations<int>(arr, d);
foreach (IList<int> v in var)
{
int[] a = v.ToArray();
lis.Add(a);
}
return lis;
}

List<int> ApproxPatternMatch(string text, string pattern, int d)
{
//return all positions where Pattern appears in Text with at most
//d mismatches
Dictionary<string, int[]> result = new Dictionary<string, int[]>();
string subs = "";
int k = pattern.Length;
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, new int[] { i });
}
else
{
int[] prev = result[subs];
int[] nu = new int[prev.Length + 1];
Array.Copy(prev, nu, prev.Length);
nu[prev.Length] = i;
result[subs] = nu;

}
}
TrieCompressed<string> trie = new TrieCompressed<string>();
List<int[]> l = GetMismatchPositions(pattern, d);//int[] has d objects
foreach(KeyValuePair<string,int[]> kv in result)
{
trie.Insert(kv.Key, kv.Key);
}
List<string> ret = new List<string>();
for (int i = 0; i < l.Count; i++)
{
char[] pat2 = new char[pattern.Length];

for (int j = 0; j < d; j++)
{
pat2[l[i][j]] = '.';
}
for (int j = 0; j < pat2.Length; j++)
{
if (pat2[j] != '.')
pat2[j] = pattern[j];
}
StringBuilder sb = new StringBuilder();
sb.Append(pat2);
ret.AddRange(trie.Partial(sb.ToString()));
}
List<int> positions = new List<int>();
for (int i = 0; i < ret.Count; i++)
{

positions.AddRange( result[ret[i]]);
}
return positions.Distinct().ToList();
}


//g-c
List<int> MinimumSkew(string genome)
{
int g = 0;
int c = 0;
int g_c = 0;
int[] skew = new int[genome.Length];
List<int> minimizing = new List<int>();
for (int i = 0; i < genome.Length; i++)
{
if (genome[i] == 'G')
{
g++;
}
else
if (genome[i] == 'C')
{
c++;
}
skew[i] = g - c;
}
int min = 10000;
for (int i = 0; i < genome.Length; i++)
{
if (skew[i] < min)
{
min = skew[i];
}
}
for (int i = 0; i < genome.Length; i++)
{
if (skew[i] == min)
minimizing.Add(i);
}
return minimizing;
}

string ReverseComplement(string input)
{
char[] c = input.Reverse().ToArray();

for (int i = 0; i < c.Count(); i++)
{
if (c[i] == 'A')
c[i] = 'T';
else
if (c[i] == 'T')
c[i] = 'A';
else
if (c[i] == 'C')
c[i] = 'G';
else
if (c[i] == 'G')
c[i] = 'C';
}
StringBuilder sb = new StringBuilder();
sb.Append(c);
return sb.ToString();
}

public Dictionary<string, int> FindFreqKMers(string text, int k)
{
Dictionary<string, int> result = new Dictionary<string, int>();
string subs = "";
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, 1);
}
else
result[subs]++;
}
int max = 0;
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value > max)
max = result.ElementAt(i).Value;
}
Dictionary<string, int> r = new Dictionary<string, int>();
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value == max)
r.Add(result.ElementAt(i).Key, result.ElementAt(i).Value);
}
return r;
}

int[] GetSubArray(int[] arr, int start, int length)
{
int[] arrb = new int[length];
int j = 0;
try
{
for (int i = start; i < start + length; i++)
{
arrb[j++] = arr[i];
}
}
catch (IndexOutOfRangeException e)
{
throw new ArgumentOutOfRangeException();
}
return arrb;
}

List<string> CyclopeptideSequencing(string spectrum)
{
//a linear peptide is consistent with Spectrum if
//every mass in its theoretical spectrum is contained
//in Spectrum
//however, the theoretical spectrum of the linear peptide
//NQEL, shown below, does not contain masses corresponding
//to LN, LNQ, or ELN, since these subpeptides “wrap around”
List<string> ret = new List<string>();//peptides as masses
StringReader str = new StringReader(Resource1.Masses);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while ((t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
List<int> masses = new List<int>();
Dictionary<string, string> mass_to_pep = new Dictionary<string, string>();
string[] mass_to_pe = new string[187];
foreach (KeyValuePair<string, string> kv in d)
{
if (!masses.Contains(Int32.Parse(kv.Value)))
masses.Add(Int32.Parse(kv.Value));
if(!mass_to_pep.ContainsKey(kv.Value))
mass_to_pep.Add(kv.Value, kv.Key);//BUG
mass_to_pe[Int32.Parse(kv.Value)] = kv.Key;
}
string[] numb = spectrum.Split(' ');
int[] spec_in = new int[numb.Length];
for (int i = 0; i < numb.Length; i++)
{
spec_in[i] = Int32.Parse(numb[i]);
}
LinkedList<int[]> list = new LinkedList<int[]>();//so no diff if it is linked or not
list.AddFirst(new int[] { 0 });
List<int> si = new List<int>();
si.AddRange(spec_in);
List<int> rem = new List<int>();
while (list.Count() > 0)
{
list = Expand(list.ToList(), masses);
LinkedListNode<int[]> start = list.First;
LinkedListNode<int[]> last = list.Last;
LinkedListNode<int[]> temp = new LinkedListNode<int[]>(null);
while(start != null)
{
temp = start.Next;
int[] peptide = start.Value;
string pep = "";
for (int j = 0; j < peptide.Length; j++)
{
if (peptide[j] != 0)
pep += mass_to_pe[peptide[j]];
}
List<int> theoretical_nonlinear = GeneratingTheoreticalSpectrumProblem(pep);
List<int> theoretical_linear = GenerateLinearTheoreticalSpectrum(peptide);
//compare
int count = 0;
if (spec_in.Length != theoretical_nonlinear.Count)
{
goto Label1;
}

for (int j = 0; j < spec_in.Length; j++)
{
//assuming sorted
if (theoretical_nonlinear[j] != spec_in[j])
break;
else
++count;
}
Label1:
if (count == spec_in.Length)
{
//output
string s ="";
for (int j = 1; j < peptide.Length; j++)
{
s += peptide[j].ToString() + "-";
}
s = s.Substring(0, s.Length - 1);
ret.Add(s);
list.Remove(start);
}
else
{
Dictionary<int, int> visited = new Dictionary<int, int>();

for (int j = 0; j < theoretical_linear.Count; j++)
{
int a = si.BinarySearch(theoretical_linear[j]);
if (visited.ContainsKey(a))
{
int s = visited[a];
visited[a]++;
if (si[a + s] != theoretical_linear[j])
{
list.Remove(start);
//list[i] = null;
break;
}
}
else
visited.Add(a, 1);
if (a < 0)
{
list.Remove(start);
break;
}

}
}
start = temp;
}
}
ret.Reverse();
return ret;
}

LinkedList<int[]> Expand(List<int[]> list, List<int> masses)
{
List<int[]> newlist = new List<int[]>();
for (int i = 0; i < list.Count(); i++)
{
for (int j = 0; j < masses.Count(); j++)
{
List<int> n = new List<int>();
n.AddRange(list[i]);
n.Add(masses[j]);
newlist.Add(n.ToArray());
}
}
LinkedList<int[]> l = new LinkedList<int[]>();
if (newlist.Count > 0)
l.AddFirst(newlist[0]);
LinkedListNode<int[]> start = l.First;
for(int i =1; i < newlist.Count;i++)
{
l.AddAfter(start,new LinkedListNode<int[]>(newlist[i]));
start = start.Next;
}
return l;
}

/// <summary>
/// More formally, given integers L and t, a k-mer Pattern forms an (L, t)-clump
/// inside a (larger) string Genome if there is an interval of Genome
/// of length L in which this k-mer appears at least t times.
/// </summary>
/// <param name="genome"></param>
/// <param name="windowSize"></param>
/// <param name="atLeastFreq"></param>
/// <returns></returns>
List<string> FindClumps(string genome, int k, int windowSize, int atLeastFreq)
{
//k - size of k-mer
List<string> lis = new List<string>();
List<string> atl = new List<string>();
string substr;
for (int i = 0; i < genome.Length; i++)
{
try
{
substr = genome.Substring(i, windowSize);
}
catch (ArgumentOutOfRangeException ex)
{
break;
}
atl.AddRange(FindAtLeastFreqKMers(substr, k, atLeastFreq));
}
lis = atl.Distinct().ToList();
return lis;
}

public List<string> FindAtLeastFreqKMers(string text, int k, int atLeastFreq)
{
Dictionary<string, int> result = new Dictionary<string, int>();
string subs = "";
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, 1);
}
else
result[subs]++;
}
Dictionary<string, int> r = new Dictionary<string, int>();
List<string> list = new List<string>();
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value >= atLeastFreq)
list.Add(result.ElementAt(i).Key);
}
return list;
}

List<int> GenerateLinearTheoreticalSpectrum(int[] peptide)
{
List<int> spectrum = new List<int>();
int[] sub;
int w = 0;
while (w < peptide.Length)
{
w++;
for (int i = 1; i < peptide.Length; i++)
{
try
{
sub = GetSubArray(peptide, i, w);
spectrum.Add(sub.ToList().Sum());
}
catch
{
break;
List<int> GeneratingTheoreticalSpectrumProblem(string peptide)
{
StringReader str = new StringReader(Resource1.Masses);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while ((t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
List<int> ret = new List<int>();

ret.Add(0);

List<string> l = GenerateAllSubpeptides(peptide);
l.Add(peptide);
for (int i = 0; i < l.Count(); i++)
{
int mass = 0;
for (int j = 0; j < l[i].Length; j++)
{
mass += Int32.Parse(d[l[i][j].ToString()]);
}
ret.Add(mass);
}
ret.Sort();

return ret;
}

void GenSubpeptides(string n, List<string> l)
{
//don't need a list at all actually!
int k = n.Length - 1;
int rep = k - 1;
int hits = 0;
for (; k >= 1; k--)
{
rep = 0;
Node node = ToLinkedList(n);
while (0==0) //while node.idx != rep the second time!
{
if (node.Idx == rep)
{
hits++;
if (hits == 2)
{
hits = 0;
break;
}
}
string sub = "";
for (int i = 0; i < k; i++)
{
sub += node.Text;
node = node.Next;
}
for (int i = 0; i < n.Length - k + 1; i++)
{
node = node.Next;
}

l.Add(sub);
}
}

}
List<string> GenerateAllSubpeptides(string peptide)
{
List<string> ret = new List<string>();
//need duplicates also!
Node n = ToLinkedList(peptide);
GenSubpeptides(peptide, ret);
return ret;
}


public List<string> PeptideEncodingProblem(string DNA, string peptide)
{
//We say that a DNA string Pattern encodes an amino acid
//string Peptide if the RNA string transcribed from either
//Pattern or its reverse complement Pattern translates into Peptide.
//[out] all substrings of text encoding given peptide

int window = peptide.Length * 3;
string rna;
string rna_rev;
List<string> ret = new List<string>();
string subs = "";
for (int i = 0; i < DNA.Length; i++)
{
try
{
subs = DNA.Substring(i, window);
}
catch { break; }
rna = Transcribe(subs);
rna_rev = Transcribe(ReverseComplement(subs));
if (peptide == Translate(rna) || peptide == Translate(rna_rev))
ret.Add(subs);
}
return ret;
}

public string Transcribe(string DNA)
{
StringBuilder rna = new StringBuilder();
for (int i = 0; i < DNA.Length; i++)
{
if (DNA[i] == 'T')
rna.Append('U');
else
rna.Append(DNA[i]);
}
return rna.ToString();
}

public string Translate(string RNA)
{
//translates into amino acid strings
StringReader str = new StringReader(Resource1.Table1);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while( (t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
// value "" means stopcodon
string translated = "";

for (int i = 0; i < RNA.Length; i+=3)
{
string subs;
try
{
subs = RNA.Substring(i, 3);
}
catch (Exception e)
{
break;
}
translated += d[subs];
}
return translated;
}

List<string> FreqWordsWithMismatchesAndRevComplements(string text, int k, int d)
{
//grab all k-mers
List<string> kmers = FindAtLeastFreqKMers(text, k, 1);

int max = 0;
Dictionary<string, int> dict = new Dictionary<string, int>();
foreach (string s in kmers)
{
List<int> pm = ApproxPatternMatch(text, s, d);
List<int> pm2 = ApproxPatternMatch(text, ReverseComplement(s), d);
int count = pm.Count + pm2.Count;
if (count >= max)
{
max = count;
dict.Add(s, max);
}
}
List<string> list = new List<string>();
foreach (KeyValuePair<string, int> kv in dict)
{
if (kv.Value == max)
{
list.Add(kv.Key);
list.Add(ReverseComplement(kv.Key));
}
}
return list;
}

void GetMismatchFills(int id, List<char[]> a, char[] arr)
{
if (id == arr.Length)
{
a.Add((char[])arr.Clone());
return;
}
char[] alph = { 'A', 'C', 'G', 'T' };

for (int i = 0; i < alph.Count(); i++)
{
arr[id] = alph[i];
GetMismatchFills(id+1, a, arr);
}

}

List<string> FrequentWordsWithMismatches(string text, int k, int d)
{
//k -kmer size, d - up to d mismatches in text
//find the most frequent k-mer with up to d mismatches

//grab all k-mers
List<string> kmers = FindAtLeastFreqKMers(text, k, 1);
List<string> kmers_m = new List<string>();//mismatching kmers
int max = 0;
Dictionary<string, int> dict = new Dictionary<string, int>();
kmers_m = kmers;// kmers_m.Distinct().ToList();//
foreach(string s in kmers_m)
{
List<int> pm = ApproxPatternMatch(text, s, d);
if (pm.Count() >= max)
{
max = pm.Count;
dict.Add(s, max);
}
}
List<string> list = new List<string>();
foreach(KeyValuePair<string,int> kv in dict)
{
if (kv.Value == max)
list.Add(kv.Key);
}
return list;
}

List<int[]> GetMismatchPositions(string pattern, int d)
{
//returns mismatch positions, if atmost d mismatch is needed then
//when using the trie it is likely only the d variations are needed
int[] arr = new int[pattern.Length];
List<int[]> lis = new List<int[]>();
for(int i = 0; i < pattern.Length;i++)
{
arr[i] = i;
}
Variations<int> var = new Variations<int>(arr, d);
foreach (IList<int> v in var)
{
int[] a = v.ToArray();
lis.Add(a);
}
return lis;
}

List<int> ApproxPatternMatch(string text, string pattern, int d)
{
//return all positions where Pattern appears in Text with at most
//d mismatches
Dictionary<string, int[]> result = new Dictionary<string, int[]>();
string subs = "";
int k = pattern.Length;
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, new int[] { i });
}
else
{
int[] prev = result[subs];
int[] nu = new int[prev.Length + 1];
Array.Copy(prev, nu, prev.Length);
nu[prev.Length] = i;
result[subs] = nu;

}
}
TrieCompressed<string> trie = new TrieCompressed<string>();
List<int[]> l = GetMismatchPositions(pattern, d);//int[] has d objects
foreach(KeyValuePair<string,int[]> kv in result)
{
trie.Insert(kv.Key, kv.Key);
}
List<string> ret = new List<string>();
for (int i = 0; i < l.Count; i++)
{
char[] pat2 = new char[pattern.Length];

for (int j = 0; j < d; j++)
{
pat2[l[i][j]] = '.';
}
for (int j = 0; j < pat2.Length; j++)
{
if (pat2[j] != '.')
pat2[j] = pattern[j];
}
StringBuilder sb = new StringBuilder();
sb.Append(pat2);
ret.AddRange(trie.Partial(sb.ToString()));
}
List<int> positions = new List<int>();
for (int i = 0; i < ret.Count; i++)
{

positions.AddRange( result[ret[i]]);
}
return positions.Distinct().ToList();
}


//g-c
List<int> MinimumSkew(string genome)
{
int g = 0;
int c = 0;
int g_c = 0;
int[] skew = new int[genome.Length];
List<int> minimizing = new List<int>();
for (int i = 0; i < genome.Length; i++)
{
if (genome[i] == 'G')
{
g++;
}
else
if (genome[i] == 'C')
{
c++;
}
skew[i] = g - c;
}
int min = 10000;
for (int i = 0; i < genome.Length; i++)
{
if (skew[i] < min)
{
min = skew[i];
}
}
for (int i = 0; i < genome.Length; i++)
{
if (skew[i] == min)
minimizing.Add(i);
}
return minimizing;
}

string ReverseComplement(string input)
{
char[] c = input.Reverse().ToArray();

for (int i = 0; i < c.Count(); i++)
{
if (c[i] == 'A')
c[i] = 'T';
else
if (c[i] == 'T')
c[i] = 'A';
else
if (c[i] == 'C')
c[i] = 'G';
else
if (c[i] == 'G')
c[i] = 'C';
}
StringBuilder sb = new StringBuilder();
sb.Append(c);
return sb.ToString();
}

public Dictionary<string, int> FindFreqKMers(string text, int k)
{
Dictionary<string, int> result = new Dictionary<string, int>();
string subs = "";
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, 1);
}
else
result[subs]++;
}
int max = 0;
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value > max)
max = result.ElementAt(i).Value;
}
Dictionary<string, int> r = new Dictionary<string, int>();
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value == max)
r.Add(result.ElementAt(i).Key, result.ElementAt(i).Value);
}
return r;
}

int[] GetSubArray(int[] arr, int start, int length)
{
int[] arrb = new int[length];
int j = 0;
try
{
for (int i = start; i < start + length; i++)
{
arrb[j++] = arr[i];
}
}
catch (IndexOutOfRangeException e)
{
throw new ArgumentOutOfRangeException();
}
return arrb;
}

List<string> CyclopeptideSequencing(string spectrum)
{
//a linear peptide is consistent with Spectrum if
//every mass in its theoretical spectrum is contained
//in Spectrum
//however, the theoretical spectrum of the linear peptide
//NQEL, shown below, does not contain masses corresponding
//to LN, LNQ, or ELN, since these subpeptides “wrap around”
List<string> ret = new List<string>();//peptides as masses
StringReader str = new StringReader(Resource1.Masses);
Dictionary<string, string> d = new Dictionary<string, string>();
string t = "";
while ((t = str.ReadLine()) != null)
{
string[] a = t.Split(' ');
d.Add(a[0], a[1]);
}
List<int> masses = new List<int>();
Dictionary<string, string> mass_to_pep = new Dictionary<string, string>();
string[] mass_to_pe = new string[187];
foreach (KeyValuePair<string, string> kv in d)
{
if (!masses.Contains(Int32.Parse(kv.Value)))
masses.Add(Int32.Parse(kv.Value));
if(!mass_to_pep.ContainsKey(kv.Value))
mass_to_pep.Add(kv.Value, kv.Key);//BUG
mass_to_pe[Int32.Parse(kv.Value)] = kv.Key;
}
string[] numb = spectrum.Split(' ');
int[] spec_in = new int[numb.Length];
for (int i = 0; i < numb.Length; i++)
{
spec_in[i] = Int32.Parse(numb[i]);
}
LinkedList<int[]> list = new LinkedList<int[]>();//so no diff if it is linked or not
list.AddFirst(new int[] { 0 });
List<int> si = new List<int>();
si.AddRange(spec_in);
List<int> rem = new List<int>();
while (list.Count() > 0)
{
list = Expand(list.ToList(), masses);
LinkedListNode<int[]> start = list.First;
LinkedListNode<int[]> last = list.Last;
LinkedListNode<int[]> temp = new LinkedListNode<int[]>(null);
while(start != null)
{
temp = start.Next;
int[] peptide = start.Value;
string pep = "";
for (int j = 0; j < peptide.Length; j++)
{
if (peptide[j] != 0)
pep += mass_to_pe[peptide[j]];
}
List<int> theoretical_nonlinear = GeneratingTheoreticalSpectrumProblem(pep);
List<int> theoretical_linear = GenerateLinearTheoreticalSpectrum(peptide);
//compare
int count = 0;
if (spec_in.Length != theoretical_nonlinear.Count)
{
goto Label1;
}

for (int j = 0; j < spec_in.Length; j++)
{
//assuming sorted
if (theoretical_nonlinear[j] != spec_in[j])
break;
else
++count;
}
Label1:
if (count == spec_in.Length)
{
//output
string s ="";
for (int j = 1; j < peptide.Length; j++)
{
s += peptide[j].ToString() + "-";
}
s = s.Substring(0, s.Length - 1);
ret.Add(s);
list.Remove(start);
}
else
{
Dictionary<int, int> visited = new Dictionary<int, int>();

for (int j = 0; j < theoretical_linear.Count; j++)
{
int a = si.BinarySearch(theoretical_linear[j]);
if (visited.ContainsKey(a))
{
int s = visited[a];
visited[a]++;
if (si[a + s] != theoretical_linear[j])
{
list.Remove(start);
//list[i] = null;
break;
}
}
else
visited.Add(a, 1);
if (a < 0)
{
list.Remove(start);
break;
}

}
}
start = temp;
}
}
ret.Reverse();
return ret;
}

LinkedList<int[]> Expand(List<int[]> list, List<int> masses)
{
List<int[]> newlist = new List<int[]>();
for (int i = 0; i < list.Count(); i++)
{
for (int j = 0; j < masses.Count(); j++)
{
List<int> n = new List<int>();
n.AddRange(list[i]);
n.Add(masses[j]);
newlist.Add(n.ToArray());
}
}
LinkedList<int[]> l = new LinkedList<int[]>();
if (newlist.Count > 0)
l.AddFirst(newlist[0]);
LinkedListNode<int[]> start = l.First;
for(int i =1; i < newlist.Count;i++)
{
l.AddAfter(start,new LinkedListNode<int[]>(newlist[i]));
start = start.Next;
}
return l;
}

/// <summary>
/// More formally, given integers L and t, a k-mer Pattern forms an (L, t)-clump
/// inside a (larger) string Genome if there is an interval of Genome
/// of length L in which this k-mer appears at least t times.
/// </summary>
/// <param name="genome"></param>
/// <param name="windowSize"></param>
/// <param name="atLeastFreq"></param>
/// <returns></returns>
List<string> FindClumps(string genome, int k, int windowSize, int atLeastFreq)
{
//k - size of k-mer
List<string> lis = new List<string>();
List<string> atl = new List<string>();
string substr;
for (int i = 0; i < genome.Length; i++)
{
try
{
substr = genome.Substring(i, windowSize);
}
catch (ArgumentOutOfRangeException ex)
{
break;
}
atl.AddRange(FindAtLeastFreqKMers(substr, k, atLeastFreq));
}
lis = atl.Distinct().ToList();
return lis;
}

public List<string> FindAtLeastFreqKMers(string text, int k, int atLeastFreq)
{
Dictionary<string, int> result = new Dictionary<string, int>();
string subs = "";
for (int i = 0; i < text.Length; i++)
{

if ((i + k) > text.Length)
break;
subs = "";
for (int j = i; j < i + k; j++)
subs += text[j];
if (!result.ContainsKey(subs))
{
if (subs.Contains(" ") || subs.Contains("\r") || subs.Contains("\n"))
throw new Exception();
System.Diagnostics.Debug.Assert(subs.Length == k, "wtf");
result.Add(subs, 1);
}
else
result[subs]++;
}
Dictionary<string, int> r = new Dictionary<string, int>();
List<string> list = new List<string>();
for (int i = 0; i < result.Count; i++)
{
if (result.ElementAt(i).Value >= atLeastFreq)
list.Add(result.ElementAt(i).Key);
}
return list;
}

List<int> GenerateLinearTheoreticalSpectrum(int[] peptide)
{
List<int> spectrum = new List<int>();
int[] sub;
int w = 0;
while (w < peptide.Length)
{
w++;
for (int i = 1; i < peptide.Length; i++)
{
try
{
sub = GetSubArray(peptide, i, w);
spectrum.Add(sub.ToList().Sum());
}
catch
{
break;
}
}

}
spectrum.Add(0);
return spectrum;
}                }
}

}
spectrum.Add(0);
return spectrum;
}

Advertisements