💾 Archived View for 0x80.org › gemlog › 2014-07-27-entropy-analysis.gmi captured on 2022-04-28 at 17:41:12. Gemini links have been rewritten to link to archived content
⬅️ Previous capture (2021-12-03)
-=-=-=-=-=-=-
I wanted a tool that does entropy analysis using Shannon entropy, so I wrote one. Shannon entropy is a measure of uncertainty. It is denoted in math by the Greek capital letter Eta (H) and is defined as the expected value E[I(X)], where I(X), aka the information content, is -log2(P(X)) (base 2, so the entropy is measured in bits). When working on a finite sample {x1,x2,x3,x4...xi} we can calculate H(X) by $H(X) = -\sum_{i} P(x_i) \log_2 P(x_i)$
This helps me find random blocks/chunks in binaries or data, in order to spot packed data, encrypted blocks and other interesting things. The entropy values go from 0 to 8 (bits per byte), where 0 (small entropy) is very not random ^_^ and 8 (large entropy) is very random. I wrote the code in Go[1] lang.
package main import ( "os" "io/ioutil" "fmt" "math" "strconv" "code.google.com/p/plotinum/plot" "code.google.com/p/plotinum/plotter" "code.google.com/p/plotinum/plotutil" ) type entropy_blocks struct { entropy float64; block []byte; off_from int; off_to int; } func main() { var block_size int; var suspecious_entropy float64; var filename string; var output string; if len(os.Args) < 2 { fmt.Printf("Usage : %s <filename> <block_size> <suspecious_entropy> <output>\n", os.Args[0]) return } filename = os.Args[1]; if len(os.Args) > 2 { temp, err := strconv.Atoi(os.Args[2]) if err != nil || temp <= 0 { fmt.Println("Invalid block_size"); return; } block_size = temp } else { block_size = 32 } if len(os.Args) > 3 { temp, err := strconv.ParseFloat(os.Args[3], 64) if err != nil || temp > 8 { fmt.Println("Invalid suspecious entropy") return } suspecious_entropy = temp } else { suspecious_entropy = 5.0 } if len(os.Args) > 4 { output = os.Args[4] } else { output = "point.png" } fmt.Printf("[+] Filename %s, block size : %d, suspecious entropy %g, output %s\n", filename, block_size, suspecious_entropy, output) brange := make([]byte, 256); for i := range brange { brange[i] = byte(i); } data, err := ioutil.ReadFile(filename); if err != nil { fmt.Println(err); return; } fmt.Printf("[*] Total Entropy %g\n", H(data,brange)); ent_blocks := generate_entropy_blocks(data, block_size, brange) fmt.Println("[+] Graphing..") if err := graph(ent_blocks, suspecious_entropy, output); err != true { fmt.Println("[-] Unble to graph") } n_suspecious := 0 for i := range(ent_blocks) { if ent_blocks[i].entropy >= suspecious_entropy { n_suspecious++; } } if n_suspecious > 0 { fmt.Printf("[*] Suspecious blocks : %d\n", n_suspecious); for i := range(ent_blocks) { if ent_blocks[i].entropy >= suspecious_entropy { fmt.Printf("[0x%.8x,0x%.8x] -> %g\n", ent_blocks[i].off_from, ent_blocks[i].off_to, ent_blocks[i].entropy) } } } else { fmt.Println("No suspecious blocks\n") } } func graph(ent_blocks 
[]entropy_blocks, suspecious_entropy float64, output string) bool { p, err := plot.New() if err != nil { return false } p.Title.Text = "Entropy" p.X.Label.Text = "offsets" p.Y.Label.Text = "Entropy" err = plotutil.AddLinePoints(p, "line", graph_xy_entropy(ent_blocks), "suspecious", graph_xy_suspecious(suspecious_entropy, ent_blocks)) if err != nil { return false } if err := p.Save(20, 20, output); err != nil { return false } return true } func graph_xy_suspecious(suspecious_entropy float64, ent_block []entropy_blocks) plotter.XYs { pts := make(plotter.XYs, len(ent_block)) for i := range pts { pts[i].X = float64(ent_block[i].off_from) pts[i].Y = suspecious_entropy } return pts } func graph_xy_entropy(ent_blocks []entropy_blocks) plotter.XYs { pts := make(plotter.XYs, len(ent_blocks)) for i := range pts { pts[i].X = float64(ent_blocks[i].off_from) pts[i].Y = float64(ent_blocks[i].entropy) } return pts } func generate_entropy_blocks(data []uint8, block_size int, brange []byte) []entropy_blocks { n_blocks := len(data)/(block_size*2)+1; ent_blocks := make([]entropy_blocks, n_blocks) cur_block := 0 from := 0 overflow := 0 for from=0;from<len(data);from+=block_size { if from+block_size > len(data) { overflow = ((from+block_size) % len(data)) } to := from+block_size-overflow; ent_blocks[cur_block].block = data[from:to] ent_blocks[cur_block].off_from = from ent_blocks[cur_block].off_to = to ent_blocks[cur_block].entropy = H(ent_blocks[cur_block].block, brange) from += block_size cur_block++; if from >= len(data) || cur_block >= (len(data)/(block_size*2))+1 { break } } return ent_blocks; } func H(data []uint8, brange []byte) float64 { var p_x float64; var entropy = float64(0); for i := range(brange) { p_x = float64(CountBytes(brange[i], Uint8ToBytes(data))) / float64(len(data)) if p_x > float64(0) { entropy += - p_x * math.Log2(p_x) } } return entropy; } func CountBytes(needle byte, haystack []byte) int { var count = 0; for i := range(haystack) { if needle == haystack[i] { 
count++; } } return count; } func Uint8ToBytes(a []uint8) []byte { b := make([]byte, len(a)); for i := range(a) { b[i] = byte(a[i]); } return b; }
For example, I have two binaries: one that is unpacked and one that is packed. Let us see what their entropy graphs look like.
ze%us:entropy_graph/ (master✗) # ./entropy_graph bin.unpacked 32 6.5 bin.unpacked.png 2>&1 > /dev/null ze%us:entropy_graph/ (master✗) # ./entropy_graph bin.packed 32 6.5 bin.packed.png 2>&1 > /dev/null
This will generate two graphs using 32-byte blocks, each with a "suspicious" line at entropy 6.5, so that anything on the graph that goes above the line tells us where to look in the binary. It will also list the suspicious blocks (entropy 6.5 and above) on stdout. bin.unpacked has an entropy of 5.909006 while bin.packed has an entropy of 7.87337; this tells us which one is packed and which one isn't. Looking at the graphs makes it clearer.
Looking at the graph of bin.unpacked, we can see that it's a normal ELF file: it starts with very small entropy, then goes up and stays at around 4, with no spikes or anything irregular. For bin.packed, however, we see that it has a high entropy in general, except at the very beginning where it contains the "unpacking code"; the rest is just packed code with a specific pattern. The binary was packed with UPX. This information can be useful while working on firmware or unknown data formats, or more generally to determine random or packed code inside a binary, or to find cryptographic keys, which have large entropy, etc.