Linguistics 473
Computational Linguistics Fundamentals
Summer 2017

Project 4


Project Description

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Collections;

namespace _473Project4
{
    class Program
    {
        static void Main(string[] args)
        {
            Node top = CreateTrie(args[0]);
            //List trieSearches = top.PrintTrie();
            List matches = SearchForMatches(top, args[1]);
            Console.Out.Write(PrintByFile(matches));
            using (StreamWriter sw = new StreamWriter("extra-credit"))
                sw.WriteLine(PrintBySequence(matches));
            Console.In.ReadLine();
        }

        public static Node CreateTrie(String fileLocation)
        {
            Node top = new Node();
            String[] searches = File.ReadAllLines(fileLocation);
            foreach (String search in searches)
            {
                top.CreateString(search);
            }
            return top;
        }

        public static List SearchForMatches(Node searchTrie, String searchDirectory)
        {
            List foundGenes = new List();
            String[] searchFiles = Directory.GetFiles(searchDirectory);
            foreach (String filePath in searchFiles)
            {
                //Console.Out.WriteLine(filePath);
                Byte[] fullFile = File.ReadAllBytes(filePath);
                int topPosition = 0;
                while (topPosition < fullFile.Length)
                {
                    //if (topPosition % 5000000 == 0)
                    //    Console.Out.WriteLine(topPosition + "/" + fullFile.Length);
                    int foundEnd = topPosition;
                    if (searchTrie.Find(fullFile, topPosition, out foundEnd))
                    {
                        String foundString = Encoding.Default.GetString(fullFile, topPosition, foundEnd - topPosition);
                        foundGenes.Add(new GeneLocation(filePath, topPosition, foundString));
                    }
                    topPosition++;
                }
            }
            return foundGenes;
        }

        public static String PrintByFile(List gls)
        {
            Dictionary byFile = new Dictionary();
            foreach (GeneLocation gl in gls)
            {
                if (!byFile.ContainsKey(gl.File))
                    byFile[gl.File] = new StringBuilder();
                byFile[gl.File].Append("\t" + gl.Location.ToString("X8") + "\t" + gl.Sequence + "\n");
            }
            StringBuilder ret = new StringBuilder();
            List keys = byFile.Keys.ToList();
            keys.Sort();
            foreach (String key in keys)
                ret.Append(key + "\n" + byFile[key]);
            return ret.ToString();
        }

        public static String PrintBySequence(List gls)
        {
            Dictionary bySeq = new Dictionary();
            foreach (GeneLocation gl in gls)
            {
                String seq = gl.Sequence.ToUpper();
                if (!bySeq.ContainsKey(seq))
                    bySeq[seq] = new StringBuilder();
                bySeq[seq].Append("\t" + gl.Location.ToString("X8") + "\t" + gl.GetShortFilename() + "\n");
            }
            StringBuilder ret = new StringBuilder();
            foreach (String key in bySeq.Keys)
                ret.Append(key + "\n" + bySeq[key]);
            return ret.ToString();
        }

    }

    class Node
    {
        static int _A = 0;
        static int _G = 1;
        static int _C = 2;
        static int _T = 3;

        Node[] Children;

        public Node()
        {
            this.Children = null;
        }

        public bool isFinal()
        {
            return Children == null;
        }

        public bool CreateString(String s)
        {
            if (s == null || s == "")
                return true;
            Char val = s[0];
            if (Children == null)
                Children = new Node[4];
            switch (val)
            {
                case 'A':
                    if (Children[_A] == null)
                        Children[_A] = new Node();
                    return Children[_A].CreateString(s.Substring(1));
                case 'G':
                    if (Children[_G] == null)
                        Children[_G] = new Node();
                    return Children[_G].CreateString(s.Substring(1));
                case 'C':
                    if (Children[_C] == null)
                        Children[_C] = new Node();
                    return Children[_C].CreateString(s.Substring(1));
                case 'T':
                    if (Children[_T] == null)
                        Children[_T] = new Node();
                    return Children[_T].CreateString(s.Substring(1));
                default:
                    return false;
            }
        }
        public List PrintTrie()
        {
            List ret = new List();
            if (Children == null)
            {
                ret.Add("");
                return ret;
            }
            if (Children[_A] != null)
            {
                foreach (String subtree in Children[_A].PrintTrie())
                    ret.Add("A" + subtree);
            }
            if (Children[_C] != null)
            {
                foreach (String subtree in Children[_C].PrintTrie())
                    ret.Add("C" + subtree);
            }
            if (Children[_G] != null)
            {
                foreach (String subtree in Children[_G].PrintTrie())
                    ret.Add("G" + subtree);
            }
            if (Children[_T] != null)
            {
                foreach (String subtree in Children[_T].PrintTrie())
                    ret.Add("T" + subtree);
            }
            return ret;
        }

        public bool Find(Byte[] search, int current, out int end)
        {
            if (Children == null)
            {
                end = current;
                return true;
            }
            if (current >= search.Length)
            {
                end = -1;
                return false;
            }
            switch (search[current])
            {
                case (Byte)'A':
                case (Byte)'a':
                    if (Children[_A] == null)
                    {
                        end = -1;
                        return false;
                    }
                    else return Children[_A].Find(search, current + 1, out end);
                case (Byte)'G':
                case (Byte)'g':
                    if (Children[_G] == null)
                    {
                        end = -1;
                        return false;
                    }
                    else return Children[_G].Find(search, current + 1, out end);
                case (Byte)'T':
                case (Byte)'t':
                    if (Children[_T] == null)
                    {
                        end = -1;
                        return false;
                    }
                    else return Children[_T].Find(search, current + 1, out end);
                case (Byte)'C':
                case (Byte)'c':
                    if (Children[_C] == null)
                    {
                        end = -1;
                        return false;
                    }
                    else return Children[_C].Find(search, current + 1, out end);
                default: //should never reach this
                    end = -1;
                    return false;
            }
        }
    }

    class GeneLocation : IComparable
    {
        public String File
        {
            get;
            set;
        }
        public int Location
        {
            get;
            set;
        }
        public String Sequence
        {
            get;
            set;
        }

        public GeneLocation(String file, int loc, String seq)
        {
            this.File = file;
            this.Location = loc;
            this.Sequence = seq;
        }

        public String GetShortFilename()
        {
            int LastSlash = File.LastIndexOf('\\');
            if (LastSlash < 0)
                return File;
            else return File.Substring(LastSlash + 1);
        }

        public int CompareTo(Object obj)
        {
            GeneLocation other = (GeneLocation)obj;
            return this.Location - other.Location;
        }

    }

}
To compile on patas:
$ gmcs proj3.cs
$ mono proj3.exe /dropbox/17-18/473/project3/fsm-input.utf8.txt fsm-output.html