// csharp/ABTSoftware/SciChart.Wpf.Examples/Examples/SciChart.Examples.Demo/Search/CreateInvertedIndex.cs
//
// CreateInvertedIndex.cs
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SciChart.Core.Extensions;
using SciChart.Examples.Demo.Helpers;

namespace SciChart.Examples.Demo.Search
{
    /// <summary>
    /// Builds and holds the inverted indexes used by the example search: one over the
    /// descriptive text of each example (title, group, features, description) and one over
    /// the element and attribute names found in each example's XAML source files.
    /// </summary>
    /// <remarks>
    /// For every (term, example) pair a <see cref="TermInfo"/> stores the positions of the term
    /// within the example's term list and a term frequency equal to the term's occurrence count
    /// divided by the Euclidean norm of all occurrence counts of that example. Once all examples
    /// have been processed, each posting's <c>InvertedDocumentFrequency</c> is set to
    /// <c>ln(exampleCount / numberOfExamplesContainingTheTerm)</c>.
    /// </remarks>
    public static class CreateInvertedIndex
    {
        private const string InvertedIndexRelativePath = @"\Resources\invertedIndex.dat";

        private static readonly char[] LineBreakChars = { '\r', '\n' };
        private static readonly char[] WordSeparators = { ' ' };

        // Built once; the pattern never changes between calls to GetTerms.
        private static readonly Regex NonWordCharacter = new Regex(@"\W");

        private static readonly HashSet<string> _stopWords;
        private static readonly HashSet<string> _codeStopWords;
        private static readonly Dictionary<string, Posting> _invertedIndex;
        private static readonly Dictionary<string, Posting> _codeInvertedIndex;

        static CreateInvertedIndex()
        {
            _stopWords = GetStopWords("stopwords");
            _codeStopWords = GetStopWords("codeStopwords");
            _invertedIndex = new Dictionary<string, Posting>();
            _codeInvertedIndex = new Dictionary<string, Posting>();
        }

        /// <summary>
        /// Loads a stop-word list from the first embedded resource of this assembly whose
        /// name contains <paramref name="fileName"/>.
        /// </summary>
        /// <param name="fileName">Fragment of the manifest resource name to look for.</param>
        /// <returns>
        /// The distinct, trimmed, non-empty lines of the resource; an empty set when no
        /// matching resource exists.
        /// </returns>
        private static HashSet<string> GetStopWords(string fileName)
        {
            var stopWords = new HashSet<string>();
            Assembly assembly = typeof (CreateInvertedIndex).Assembly;

            string resourceName = assembly
                .GetManifestResourceNames()
                .FirstOrDefault(name => name.Contains(fileName));

            if (resourceName == null)
            {
                return stopWords;
            }

            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                if (stream == null)
                {
                    return stopWords;
                }

                using (var reader = new StreamReader(stream))
                {
                    // Split on both CR and LF so that the line-ending style of the resource
                    // (LF, CRLF or CR) never leaves a stray control character on a word.
                    var lines = reader.ReadToEnd().Split(LineBreakChars, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var line in lines)
                    {
                        var word = line.Trim();
                        if (word.Length > 0)
                        {
                            stopWords.Add(word);
                        }
                    }
                }
            }

            return stopWords;
        }

        /// <summary>
        /// Converts free text into the list of index terms: the text is lower-cased, every
        /// non-word character becomes a separator, stop words are dropped and the remaining
        /// words are expanded by the <see cref="NGramTokenizer"/>.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <returns>All produced terms longer than one character, in order of appearance.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public static IList<string> GetTerms(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Invariant casing keeps the produced terms independent of the machine's culture,
            // which matters because the index can be persisted and loaded elsewhere.
            string normalised = NonWordCharacter.Replace(text.ToLowerInvariant(), " ");

            var words = normalised
                .Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries)
                .Where(word => !_stopWords.Contains(word));

            return TokenizeWords(words);
        }

        /// <summary>
        /// Rebuilds the text index from the title, group, features and description of the
        /// given examples. Any previously held text index content is discarded first.
        /// </summary>
        /// <param name="examples">Examples to index, keyed by their page id.</param>
        /// <exception cref="ArgumentNullException"><paramref name="examples"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// An example yields more than <see cref="ushort.MaxValue"/> terms.
        /// </exception>
        public static void CreateIndex(IEnumerable<KeyValuePair<Guid, Example>> examples)
        {
            if (examples == null)
            {
                throw new ArgumentNullException("examples");
            }

            var exampleList = examples.ToList();

            // Start from scratch: adding raw document counts on top of the logarithmic
            // values left by a previous build would corrupt the frequencies.
            _invertedIndex.Clear();

            foreach (var example in exampleList)
            {
                var terms = GetTerms(GetTextFromExample(example.Value));

                // Memory optimisation. Store term indices as ushort (16bit)
                if (terms.Count > ushort.MaxValue)
                {
                    throw new InvalidOperationException("Too many terms in this example: " + example.Value.Title);
                }

                AddExampleToIndex(_invertedIndex, example.Key, terms);
            }

            ApplyInverseDocumentFrequency(_invertedIndex, exampleList.Count);
        }

        /// <summary>
        /// Rebuilds the code index from the XAML source files of the given examples. Any
        /// previously held code index content is discarded first.
        /// </summary>
        /// <param name="examples">Examples to index, keyed by their page id.</param>
        /// <exception cref="ArgumentNullException"><paramref name="examples"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// An example yields more than <see cref="ushort.MaxValue"/> terms.
        /// </exception>
        public static void CreateIndexForCode(IEnumerable<KeyValuePair<Guid, Example>> examples)
        {
            if (examples == null)
            {
                throw new ArgumentNullException("examples");
            }

            var exampleList = examples.ToList();

            // See CreateIndex: a rebuild must not accumulate on top of earlier results.
            _codeInvertedIndex.Clear();

            foreach (var example in exampleList)
            {
                var words = GetSourceCodeFromExample(example.Value)
                    .ToLowerInvariant()
                    .Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);

                var terms = TokenizeWords(words);

                // Memory optimisation. Store term indices as ushort (16bit)
                if (terms.Count > ushort.MaxValue)
                {
                    throw new InvalidOperationException("Too many code terms for example: " + example.Value.Title);
                }

                AddExampleToIndex(_codeInvertedIndex, example.Key, terms);
            }

            ApplyInverseDocumentFrequency(_codeInvertedIndex, exampleList.Count);

            foreach (var posting in _codeInvertedIndex.Values)
            {
                // Collapse memory of List
                posting.TermInfos = posting.TermInfos.ToList();
            }
        }

        /// <summary>
        /// Expands words into index terms with a fresh <see cref="NGramTokenizer"/> and keeps
        /// only the terms that are longer than one character.
        /// </summary>
        private static List<string> TokenizeWords(IEnumerable<string> words)
        {
            var tokenizer = new NGramTokenizer();

            return words
                .Select(tokenizer.Tokenize)
                .SelectMany(groups => groups.SelectMany(group => group))
                .Select(token => token.ToString())
                .Where(term => !string.IsNullOrEmpty(term) && term.Length > 1)
                .ToList();
        }

        /// <summary>
        /// Adds one example to <paramref name="index"/>: records the positions of every term,
        /// a term frequency normalised by the Euclidean norm of the example's term counts,
        /// and increments the document count of every distinct term.
        /// </summary>
        /// <param name="index">Index to update.</param>
        /// <param name="exampleId">Page id of the example the terms belong to.</param>
        /// <param name="terms">
        /// Terms of the example in order; callers have verified that the count fits in a ushort.
        /// </param>
        private static void AddExampleToIndex(Dictionary<string, Posting> index, Guid exampleId, IList<string> terms)
        {
            var positionsByTerm = new Dictionary<string, List<ushort>>();
            for (int i = 0; i < terms.Count; i++)
            {
                List<ushort> positions;
                if (!positionsByTerm.TryGetValue(terms[i], out positions))
                {
                    positions = new List<ushort>();
                    positionsByTerm[terms[i]] = positions;
                }

                positions.Add((ushort) i);
            }

            double norm = Math.Sqrt(positionsByTerm.Values.Sum(positions => Sqr(positions.Count)));

            foreach (var entry in positionsByTerm)
            {
                var termInfo = new TermInfo(exampleId, entry.Value.ToArray(), (float) (entry.Value.Count / norm));

                Posting posting;
                if (index.TryGetValue(entry.Key, out posting))
                {
                    posting.TermInfos.Add(termInfo);
                }
                else
                {
                    posting = new Posting(new List<TermInfo> { termInfo });
                    index[entry.Key] = posting;
                }

                // Until ApplyInverseDocumentFrequency runs this holds the raw number of
                // examples that contain the term.
                posting.InvertedDocumentFrequency += 1;
            }
        }

        /// <summary>
        /// Replaces the raw per-term example counts in <paramref name="index"/> with
        /// <c>ln(documentCount / count)</c>.
        /// </summary>
        private static void ApplyInverseDocumentFrequency(Dictionary<string, Posting> index, int documentCount)
        {
            foreach (var posting in index.Values)
            {
                posting.InvertedDocumentFrequency = Math.Log(documentCount / posting.InvertedDocumentFrequency);
            }
        }

        /// <summary>
        /// Resolves the index file location: the part of the executing assembly's path that
        /// precedes its first <c>\bin</c> occurrence, followed by the relative index path.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The executing assembly's path does not contain <c>\bin</c>.
        /// </exception>
        private static string GetIndexFilePath()
        {
            var location = Assembly.GetExecutingAssembly().Location;

            var binIndex = location.IndexOf(@"\bin", StringComparison.Ordinal);
            if (binIndex < 0)
            {
                throw new InvalidOperationException(
                    "Cannot resolve the inverted index file: the assembly location '" + location +
                    "' does not contain a \\bin folder.");
            }

            return location.Substring(0, binIndex) + InvertedIndexRelativePath;
        }

        /// <summary>
        /// Writes the text index to the index file, one line per term in the form
        /// <c>term|exampleId:pos,pos;exampleId:pos|tf,tf|idf</c>.
        /// </summary>
        private static void WriteIndexToFile()
        {
            using (var outFile = new StreamWriter(GetIndexFilePath()))
            {
                foreach (KeyValuePair<string, Posting> posting in _invertedIndex)
                {
                    var postingList = new List<string>();
                    var termFrequencies = new List<string>();

                    foreach (var termInfo in posting.Value.TermInfos)
                    {
                        string indexes = string.Join(",", termInfo.TermEntryIndexes);
                        postingList.Add(string.Format(CultureInfo.InvariantCulture, "{0}:{1}", termInfo.ExamplePageId, indexes));

                        // Invariant formatting: a culture with ',' as the decimal separator
                        // would otherwise collide with the ',' list separator.
                        termFrequencies.Add(Convert.ToString(termInfo.TermFrequency, CultureInfo.InvariantCulture));
                    }

                    outFile.WriteLine(string.Format(
                        CultureInfo.InvariantCulture,
                        "{0}|{1}|{2}|{3}",
                        posting.Key,
                        string.Join(";", postingList),
                        string.Join(",", termFrequencies),
                        posting.Value.InvertedDocumentFrequency));
                }
            }
        }

#if !SILVERLIGHT
        /// <summary>
        /// Replaces the text index with the content of the index file. Each non-blank line
        /// must have the form <c>term|exampleId:pos,pos;exampleId:pos|tf,tf|idf</c>.
        /// </summary>
        /// <exception cref="FormatException">A line does not follow the expected layout.</exception>
        public static void ReadIndexFromFile()
        {
            string[] lines = File.ReadAllLines(GetIndexFilePath());

            _invertedIndex.Clear();
            foreach (var line in lines)
            {
                if (line.Trim().Length == 0)
                {
                    continue;
                }

                var fields = line.Split('|');
                if (fields.Length < 4)
                {
                    throw new FormatException("Malformed inverted index line (expected 4 '|'-separated fields): " + line);
                }

                string term = fields[0];
                var postings = fields[1].Split(';');
                var termFrequencies = fields[2].Split(',');
                var invertedDocFrequency = double.Parse(fields[3], CultureInfo.InvariantCulture);

                if (termFrequencies.Length < postings.Length)
                {
                    throw new FormatException("Malformed inverted index line (fewer term frequencies than postings): " + line);
                }

                var termInfos = new List<TermInfo>(postings.Length);

                for (int i = 0; i < postings.Length; i++)
                {
                    var tf = double.Parse(termFrequencies[i], CultureInfo.InvariantCulture);

                    var post = postings[i].Split(':');
                    if (post.Length < 2)
                    {
                        throw new FormatException("Malformed inverted index posting '" + postings[i] + "' in line: " + line);
                    }

                    var termEntries = post[1]
                        .Split(',')
                        .Select(entry => ushort.Parse(entry, CultureInfo.InvariantCulture))
                        .ToArray();

                    termInfos.Add(new TermInfo(new Guid(post[0]), termEntries, (float) tf));
                }

                _invertedIndex[term] = new Posting(termInfos) {InvertedDocumentFrequency = invertedDocFrequency};
            }
        }
#endif

        /// <summary>
        /// Concatenates the title, group, features and description of an example into one
        /// space-separated string.
        /// </summary>
        private static string GetTextFromExample(Example example)
        {
            var sb = new StringBuilder();

            sb.AppendFormat("{0} ", example.Title);
            sb.AppendFormat("{0} ", example.Group);
            foreach (var feature in example.Features)
            {
                sb.AppendFormat("{0} ", feature);
            }
            sb.AppendFormat("{0} ", example.Description);

            return sb.ToString();
        }

        /// <summary>
        /// Collects, as one space-separated string, the local names of XML elements and their
        /// attributes from every <c>.xaml</c> source file of an example.
        /// </summary>
        /// <remarks>
        /// Only the descendants of the root's child elements are visited; the root element and
        /// its direct children themselves are not emitted. An element whose local name is a
        /// code stop word is skipped together with its attributes.
        /// </remarks>
        private static string GetSourceCodeFromExample(Example example)
        {
            var sb = new StringBuilder();

            var xamlFiles = example.SourceFiles
                .Where(file => file.Key.EndsWith(".xaml", StringComparison.Ordinal));

            foreach (var xamlFile in xamlFiles)
            {
                var xml = XDocument.Parse(xamlFile.Value);

                // Elements() skips comments, text and processing instructions, none of which
                // have descendants to contribute.
                var elements = xml.Root.Elements().SelectMany(child => child.Descendants());

                foreach (var element in elements)
                {
                    if (_codeStopWords.Contains(element.Name.LocalName))
                    {
                        continue;
                    }

                    sb.Append(element.Name.LocalName).Append(' ');
                    foreach (var attribute in element.Attributes())
                    {
                        sb.Append(attribute.Name.LocalName).Append(' ');
                    }
                }
            }

            return sb.ToString();
        }

        /// <summary>Returns the live text index (term to posting); not a copy.</summary>
        public static Dictionary<string, Posting> GetInvertedIndex()
        {
            return _invertedIndex;
        }

        /// <summary>Returns the live code index (term to posting); not a copy.</summary>
        public static Dictionary<string, Posting> GetCodeInvertedIndex()
        {
            return _codeInvertedIndex;
        }

        /// <summary>
        /// Squares a count. Computed in 64-bit because a count may be as large as
        /// <see cref="ushort.MaxValue"/>, whose square does not fit in an <see cref="int"/>.
        /// </summary>
        private static long Sqr(int value)
        {
            return (long) value * value;
        }
    }
}