Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- *
- * Calculates the overall data entropy (Shannon) of a file and of each symbol used.
- * More @ http://csharpcodewhisperer.blogspot.com/2013/07/Data-Entropy.html
- *
- */
/// <summary>
/// Accumulates byte-level symbol counts over one or more chunks of data and
/// computes the Shannon entropy (in bits per symbol) of everything examined so far.
/// </summary>
public class DataEntropyUTF8
{
    // Number of times each byte value has been seen.
    Dictionary<byte, int> distributionDict;
    // Probability (count / total bytes) of each byte value, as of the last entropy calculation.
    Dictionary<byte, double> probabilityDict;
    // Result of the last entropy calculation.
    double overallEntropy;
    // True when data has been added since the last entropy calculation.
    bool isDirty;
    // Total number of bytes examined.
    int dataSize;

    /// <summary>Total number of bytes examined so far.</summary>
    public int DataSampleSize
    {
        get { return dataSize; }
        private set { dataSize = value; }
    }

    /// <summary>Number of distinct byte values seen so far.</summary>
    public int UniqueSymbols
    {
        get { return distributionDict.Count; }
    }

    /// <summary>
    /// Returns how many times <paramref name="symbol"/> has been seen.
    /// </summary>
    /// <param name="symbol">The byte value to look up.</param>
    /// <returns>The occurrence count, widened to <c>double</c>.</returns>
    /// <exception cref="KeyNotFoundException">The symbol has never been seen.</exception>
    public double GetSymbolDistribution(byte symbol)
    {
        return distributionDict[symbol];
    }

    /// <summary>
    /// Returns the value stored for <paramref name="symbol"/> in the probability table,
    /// i.e. its occurrence count divided by the total number of bytes examined.
    /// Despite the method name, this is the symbol's probability, not its entropy term.
    /// </summary>
    /// <param name="symbol">The byte value to look up.</param>
    /// <returns>The symbol's probability.</returns>
    /// <exception cref="KeyNotFoundException">The symbol has never been seen.</exception>
    public double GetSymbolEntropy(byte symbol)
    {
        // The probability table is only rebuilt by GetEntropy(); refresh it first so
        // callers never read values that predate the most recent ExamineChunk call.
        GetEntropy();
        return probabilityDict[symbol];
    }

    /// <summary>
    /// Computes the Shannon entropy of all data examined so far, in bits per symbol.
    /// The result is cached until more data is added.
    /// </summary>
    /// <returns>The entropy; 0 when no data has been examined.</returns>
    public double GetEntropy()
    {
        // If nothing has changed, don't recalculate.
        if (!isDirty)
        {
            return overallEntropy;
        }

        double entropy = 0;
        Dictionary<byte, double> probabilities = new Dictionary<byte, double>(distributionDict.Count);
        foreach (KeyValuePair<byte, int> entry in distributionDict)
        {
            // Probability = frequency of symbol / number of symbols examined so far.
            double probability = (double)entry.Value / (double)dataSize;
            probabilities.Add(entry.Key, probability);

            // Entropy contribution = probability * log2(1 / probability).
            entropy += probability * Math.Log(1 / probability, 2);
        }

        probabilityDict = probabilities;
        overallEntropy = entropy;
        isDirty = false;
        return overallEntropy;
    }

    /// <summary>
    /// Adds the bytes of <paramref name="chunk"/> to the running symbol counts.
    /// A null or empty chunk is ignored.
    /// </summary>
    /// <param name="chunk">The bytes to examine.</param>
    public void ExamineChunk(byte[] chunk)
    {
        // Null must be tested before Length, otherwise a null chunk throws.
        if (chunk == null || chunk.Length < 1)
        {
            return;
        }

        isDirty = true;
        dataSize += chunk.Length;

        foreach (byte symbol in chunk)
        {
            // TryGetValue leaves count at 0 for an unseen symbol, so one lookup covers both cases.
            int count;
            distributionDict.TryGetValue(symbol, out count);
            distributionDict[symbol] = count + 1;
        }
    }

    /// <summary>
    /// Adds the UTF-8 encoded bytes of <paramref name="chunk"/> to the running symbol counts.
    /// A null or empty string is ignored.
    /// </summary>
    /// <param name="chunk">The text to examine.</param>
    public void ExamineChunk(string chunk)
    {
        if (string.IsNullOrEmpty(chunk))
        {
            return;
        }

        ExamineChunk(StringToByteArray(chunk));
    }

    // Encodes the string as UTF-8 bytes. (Casting boxed chars to byte via LINQ Cast<byte>()
    // throws InvalidCastException, so a real encoding is used instead.)
    byte[] StringToByteArray(string inputString)
    {
        return System.Text.Encoding.UTF8.GetBytes(inputString);
    }

    // Resets all counters, tables and the cached entropy to their empty state.
    void Clear()
    {
        isDirty = false;
        overallEntropy = 0;
        dataSize = 0;
        distributionDict = new Dictionary<byte, int>();
        probabilityDict = new Dictionary<byte, double>();
    }

    /// <summary>
    /// Creates an instance, examines the entire contents of the named file
    /// and calculates its entropy.
    /// </summary>
    /// <param name="fileName">Path of the file to read.</param>
    public DataEntropyUTF8(string fileName)
    {
        this.Clear();
        ExamineChunk(File.ReadAllBytes(fileName));
        GetEntropy();
    }

    /// <summary>Creates an empty instance with no data examined.</summary>
    public DataEntropyUTF8()
    {
        this.Clear();
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement