Advertisement
BurningBunny

Calculating Overall Data Entropy, in C#

Jul 18th, 2013
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 2.57 KB | None | 0 0
  1. /*
  2.  *
  3.  * Calculates the overall Shannon entropy of a file (or of byte/string chunks) and tracks statistics for each symbol used.
  4.  *     More @ http://csharpcodewhisperer.blogspot.com/2013/07/Data-Entropy.html
  5.  *
  6.  */
  7.  
  /// <summary>
  /// Incrementally computes the Shannon entropy (in bits per symbol) of a stream
  /// of bytes. Data is fed in through <see cref="ExamineChunk(byte[])"/> or
  /// <see cref="ExamineChunk(string)"/>; strings are encoded as UTF-8 before
  /// being counted. The entropy is cached and only recomputed after new data
  /// has been examined.
  /// </summary>
  public class DataEntropyUTF8
  {
      // Number of times each byte value has been seen so far.
      Dictionary<byte, int> distributionDict;
      // Probability (count / total bytes) of each byte value, rebuilt by GetEntropy().
      Dictionary<byte, double> probabilityDict;
      // Result of the last entropy calculation.
      double overallEntropy;
      // True when data has been added since the last entropy calculation.
      bool isDirty;
      // Total number of bytes examined.
      int dataSize;

      /// <summary>Total number of bytes examined so far.</summary>
      public int DataSampleSize
      {
          get { return dataSize; }
          private set { dataSize = value; }
      }

      /// <summary>Number of distinct byte values seen so far.</summary>
      public int UniqueSymbols
      {
          get { return distributionDict.Count; }
      }

      /// <summary>
      /// Returns how many times <paramref name="symbol"/> has been seen.
      /// </summary>
      /// <param name="symbol">Byte value to look up.</param>
      /// <returns>The occurrence count of the symbol.</returns>
      /// <exception cref="KeyNotFoundException">The symbol has never been examined.</exception>
      public double GetSymbolDistribution(byte symbol)
      {
          return distributionDict[symbol];
      }

      /// <summary>
      /// Returns the value stored for <paramref name="symbol"/> in the probability
      /// table. Despite the method name, that value is the symbol's probability
      /// (its count divided by <see cref="DataSampleSize"/>); this is kept for
      /// compatibility with existing callers.
      /// </summary>
      /// <param name="symbol">Byte value to look up.</param>
      /// <returns>The probability of the symbol in the data examined so far.</returns>
      /// <exception cref="KeyNotFoundException">The symbol has never been examined.</exception>
      public double GetSymbolEntropy(byte symbol)
      {
          // The probability table is only rebuilt inside GetEntropy(); refresh it
          // first so the answer reflects every chunk examined so far.
          if (isDirty)
          {
              GetEntropy();
          }

          return probabilityDict[symbol];
      }

      /// <summary>
      /// Computes the Shannon entropy, in bits per symbol, of all data examined so
      /// far. The result is cached until more data is examined.
      /// </summary>
      /// <returns>The entropy, or 0 when no data has been examined.</returns>
      public double GetEntropy()
      {
          // Nothing new since the last calculation: reuse the cached value.
          if (!isDirty)
          {
              return overallEntropy;
          }

          Dictionary<byte, double> probabilities = new Dictionary<byte, double>(distributionDict.Count);
          double entropy = 0;

          foreach (KeyValuePair<byte, int> entry in distributionDict)
          {
              // Probability = frequency of symbol / number of symbols examined.
              double probability = (double)entry.Value / (double)dataSize;
              probabilities.Add(entry.Key, probability);

              // Entropy contribution = probability * Log2(1 / probability).
              entropy += probability * Math.Log(1 / probability, 2);
          }

          probabilityDict = probabilities;
          overallEntropy = entropy;
          isDirty = false;
          return overallEntropy;
      }

      /// <summary>
      /// Adds the bytes in <paramref name="chunk"/> to the running statistics.
      /// A null or empty chunk is ignored.
      /// </summary>
      /// <param name="chunk">Bytes to examine.</param>
      public void ExamineChunk(byte[] chunk)
      {
          // The null test must come first; reading Length on a null array throws.
          if (chunk == null || chunk.Length < 1)
          {
              return;
          }

          isDirty = true;
          dataSize += chunk.Length;

          foreach (byte symbol in chunk)
          {
              // TryGetValue leaves count at 0 for a symbol that has not been seen yet.
              int count;
              distributionDict.TryGetValue(symbol, out count);
              distributionDict[symbol] = count + 1;
          }
      }

      /// <summary>
      /// Encodes <paramref name="chunk"/> as UTF-8 and adds the resulting bytes to
      /// the running statistics. A null or empty string is ignored.
      /// </summary>
      /// <param name="chunk">Text to examine.</param>
      public void ExamineChunk(string chunk)
      {
          if (string.IsNullOrEmpty(chunk))
          {
              return;
          }

          ExamineChunk(StringToByteArray(chunk));
      }

      // Converts a string to its UTF-8 byte representation (empty array for null).
      // Casting boxed chars to byte via Enumerable.Cast<byte>() throws
      // InvalidCastException, so a real text encoding is used instead.
      byte[] StringToByteArray(string inputString)
      {
          if (inputString == null)
          {
              return new byte[0];
          }

          return System.Text.Encoding.UTF8.GetBytes(inputString);
      }

      // Resets all counters and cached results to the "no data examined" state.
      void Clear()
      {
          isDirty = false;
          overallEntropy = 0;
          dataSize = 0;
          distributionDict = new Dictionary<byte, int>();
          probabilityDict = new Dictionary<byte, double>();
      }

      /// <summary>
      /// Creates an instance, examines the whole content of the given file and
      /// computes its entropy.
      /// </summary>
      /// <param name="fileName">Path of the file to read.</param>
      public DataEntropyUTF8(string fileName)
      {
          this.Clear();

          ExamineChunk(File.ReadAllBytes(fileName));
          GetEntropy();
      }

      /// <summary>Creates an empty instance with no data examined.</summary>
      public DataEntropyUTF8()
      {
          this.Clear();
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement