Jonas' blog

IT security & forensics

RSS · Twitter · GitHub

strings benchmark of Go, Java, Python and Rust

A couple of days after last month’s blog post about sha1sum benchmarks, some researchers showed that SHA-1 can be broken in practice. However, this does not invalidate the test results.

As another test I decided to code a simple strings clone in some programming languages and benchmark those.

For each file given, gnu strings prints the printable character sequences that are at least 4 characters long […] and are followed by an unprintable character. – GNU Binary Utilities documentation

I ran the code on my mid-2011 MacBook Air with an SSD. The following table lists the runtime of the code. As an input file I used a random 100 MB file.

macOS
2 Core 1,7 GHz
4 GB RAM
Go 1.8 1.9 s
Java 8 121 16.3 s
Python 3.6.0 59.8 s
Rust 1.16.0* 5.0 s

* Compiled with -O flag

And again, these results are not really scientifically sound, but they show by far the best performance for Go. Python really lags behind this time, while it was quite fast for the processing of SHA-1 values.

These results led me to the decision to use Go for my forensic projects, so that I can cope with huge amounts of data in a reasonable time.

You can send me feedback on this post via Twitter.

Code

Go strings

    package main

    import (
        "bufio"
        "bytes"
        "os"
    )

    // main extracts printable strings from the file named by the first
    // command-line argument and writes them to standard output, one per line.
    //
    // It exits with status 2 when no file argument is given and with status 1
    // when the file cannot be opened.
    func main() {
        if len(os.Args) < 2 {
            os.Stderr.WriteString("usage: strings <file>\n")
            os.Exit(2)
        }

        file, err := os.Open(os.Args[1])
        if err != nil {
            // The error text of os.Open already names the operation and the path.
            os.Stderr.WriteString(err.Error() + "\n")
            os.Exit(1)
        }
        defer file.Close()

        // The file is processed in 16 MiB chunks; currentString carries a
        // printable run across chunk boundaries.
        buffer := make([]byte, 4096*4096)
        var currentString bytes.Buffer
        f := bufio.NewWriter(os.Stdout)
        defer f.Flush()
        for {
            size, err := file.Read(buffer)
            // Consume whatever was read before looking at the error, as the
            // io.Reader contract allows data and an error in the same call.
            if size > 0 {
                Strings(buffer[:size], &currentString, f)
            }
            if err != nil {
                // Any error, including the one that marks the normal end of
                // the file, ends the loop.
                break
            }
        }

        // A run that reaches the end of the file has no terminating
        // unprintable byte, so it is emitted here.
        if currentString.Len() >= 4 {
            currentString.WriteTo(f)
            f.WriteByte('\n')
        }
    }

    func Strings(data []byte, currentString *bytes.Buffer, f *bufio.Writer) {
        for _, c := range data {
            if (c >= ' ' && c <= '~') || c == '' {
                currentString.WriteByte(c)
            } else {
                if currentString.Len() >= 4 {
                    currentString.WriteTo(f)
                    f.WriteByte('\n')
                } else if currentString.Len() > 0 {
                    currentString.Reset()
                }
            }
        }
    }

Python strings

    import sys
    import itertools

    def strings(data, current_string):
        """Print every printable run of at least four bytes found in ``data``.

        A run consists of bytes in the range 0x20..0x7e plus form feed (0x0c)
        and is printed, on its own line, when an unprintable byte ends it.

        ``data`` is a bytes-like object; iterating it yields integers.
        ``current_string`` is a ``bytearray`` holding the run that is still
        open. It is modified in place and returned, so a run can continue
        across successive calls; a run still open at the end of ``data`` is
        left in it and not printed.
        """
        for c in data:
            if (0x20 <= c <= 0x7e) or c == 0x0c:
                current_string.append(c)
            else:
                # An unprintable byte closes the run; short runs are dropped.
                if len(current_string) >= 4:
                    print(current_string.decode())
                current_string.clear()
        return current_string

    if __name__ == '__main__':
        # Process the file named by the first argument in 16 MiB chunks; the
        # open run is carried from one chunk to the next in current_string.
        with open(sys.argv[1], 'rb') as source:
            current_string = bytearray()
            # iter() with a sentinel stops at the first empty read (end of file).
            for data in iter(lambda: source.read(4096 * 4096), b''):
                current_string = strings(data, current_string)
            # A run that reaches the end of the file is never closed by an
            # unprintable byte, so it is printed here.
            if len(current_string) >= 4:
                print(current_string.decode())

Java strings

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.FileReader;
    import java.io.InputStreamReader;
    import java.lang.StringBuilder;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    /**
     * Minimal clone of the {@code strings} utility: prints every run of at
     * least four printable characters found in a file, one run per line.
     */
    public class Strings {

        /** Shortest run of printable characters that is printed. */
        private static final int MIN_LENGTH = 4;

        /**
         * Reads the file named by {@code args[0]} in chunks and prints its
         * printable runs to standard output.
         *
         * @param args command-line arguments; {@code args[0]} is the input file
         * @throws Exception if the file cannot be opened or read
         */
        public static void main(String[] args) throws Exception {
            char[] buffer = new char[4096 * 4096];
            StringBuilder currentString = new StringBuilder();

            // The input is binary. ISO-8859-1 maps every byte to exactly one
            // char with the same value, so the result does not depend on the
            // platform default charset. try-with-resources closes the reader
            // even when reading fails.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(args[0]), StandardCharsets.ISO_8859_1))) {
                int size;
                while ((size = br.read(buffer)) != -1) {
                    currentString = strings(buffer, size, currentString);
                }
            }

            // A run that reaches the end of the file has no terminating
            // unprintable character, so it is printed here.
            if (currentString.length() >= MIN_LENGTH) {
                System.out.println(currentString);
            }
        }

        /**
         * Scans the first {@code size} characters of {@code data} and prints
         * each run of printable characters (0x20..0x7e plus form feed) that is
         * at least four characters long once an unprintable character ends it.
         *
         * @param data          characters to scan
         * @param size          number of valid characters at the start of {@code data}
         * @param currentString run that is still open; modified in place so a
         *                      run can continue across successive calls
         * @return {@code currentString}, holding the run still open at the end
         *         of the scanned range
         */
        public static StringBuilder strings(char[] data, int size, StringBuilder currentString) {
            for (int i = 0; i < size; i++) {
                char c = data[i];
                if ((0x20 <= c && c <= 0x7e) || c == 0x0c) {
                    currentString.append(c);
                } else {
                    if (currentString.length() >= MIN_LENGTH) {
                        System.out.println(currentString);
                    }
                    currentString.setLength(0); // drop the closed or too-short run
                }
            }
            return currentString;
        }
    }

Rust strings

    use std::fs::File;
    use std::env;
    use std::io::{self, Read, Write};

    /// Entry point: prints every printable run of at least four bytes found in
    /// the file named by the first command-line argument, one run per line.
    ///
    /// On a missing argument or an I/O error the message is written to standard
    /// error and the process exits with status 1.
    fn main() {
        if let Err(error) = run() {
            eprintln!("strings: {}", error);
            std::process::exit(1);
        }
    }

    /// Reads the input file in 16 MiB chunks and feeds each chunk to `strings`.
    fn run() -> io::Result<()> {
        let path = env::args()
            .nth(1)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "usage: strings <file>"))?;
        let mut f = File::open(&path)?;

        let mut buf = vec![0u8; 4096 * 4096];
        // Holds the run that is still open, so a run can span two chunks.
        let mut current_string: Vec<u8> = Vec::new();

        // Lock stdout once and buffer it: the default handle is line-buffered
        // and would flush after every extracted string.
        let stdout = io::stdout();
        let mut out = io::BufWriter::new(stdout.lock());

        loop {
            let size = match f.read(&mut buf) {
                Ok(0) => break,
                Ok(n) => n,
                // An interrupted read transferred nothing and may be retried.
                Err(ref error) if error.kind() == io::ErrorKind::Interrupted => continue,
                Err(error) => return Err(error),
            };

            strings(&buf[..size], &mut current_string, &mut out);
        }

        // A run that reaches the end of the file has no terminating
        // unprintable byte, so it is emitted here.
        if current_string.len() >= 4 {
            out.write_all(&current_string)?;
            out.write_all(b"\n")?;
        }
        // Flush explicitly: dropping a BufWriter discards flush errors.
        out.flush()
    }

    /// Scans `data` for runs of printable bytes (0x20..=0x7e plus form feed) and
    /// writes every run of at least four bytes to `stdout`, followed by a
    /// newline, as soon as an unprintable byte ends the run.
    ///
    /// `current_string` holds the run that is still open. It is carried in and
    /// out by the caller so that a run may span several calls; a run still open
    /// when `data` is exhausted is left in it and not written.
    ///
    /// # Panics
    ///
    /// Panics if writing to `stdout` fails.
    fn strings(data: &[u8], current_string: &mut Vec<u8>, stdout: &mut dyn io::Write) {
        for &c in data {
            let printable = (0x20..=0x7e).contains(&c) || c == 0x0c;
            if printable {
                current_string.push(c);
                continue;
            }

            // An unprintable byte closes the run; short runs are dropped.
            if current_string.len() >= 4 {
                stdout.write_all(current_string.as_slice()).unwrap();
                // write_all, unlike write, never leaves the newline unwritten.
                stdout.write_all(b"\n").unwrap();
            }
            current_string.clear();
        }
    }