Faster file IO in python using cython

Reading large files in Python sometimes feels incredibly slow. Here are some approaches using Cython to minimize reading times. Simply compiling the existing Python code with Cython reduces the reading time by 23%. By introducing explicit type definitions, I could finally reach C++ reading speeds, which are 4.4x faster than pure Python code. However, when I used the generator keyword yield to iterate over all lines in an external Python function without exhausting my memory, the required runtime doubled for this approach. The code snippets used are listed below.

timings

File: file_io_python.py Simple python function to read in a file line by line.

def read_file_python(filename):
    """Read ``filename`` line by line in binary mode, discarding every line.

    This is the pure-Python baseline for the benchmark: it only measures
    the cost of calling ``readline()`` until the end of the file.

    Args:
        filename: Path of the file to read.

    Returns:
        An empty list, so the result can still be iterated by callers.
        Enable the ``yield`` statement below to turn the function into a
        generator that hands each line to the caller instead.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # The context manager closes the file even if readline() raises.
    with open(filename, "rb") as f:
        while True:
            line = f.readline()
            if not line:
                break

            #yield line

    return []

File: file_io.pyx Cython file, containing a pure python function and a cython optimized function for linewise file reading.

from libc.stdio cimport *
from libc.stdlib cimport free
 
# Declarations of the C stdio functions that read_file() calls directly.
cdef extern from "stdio.h":
    # C prototype: FILE * fopen ( const char * filename, const char * mode )
    FILE *fopen(const char *, const char *)
    # C prototype: int fclose ( FILE * stream )
    int fclose(FILE *)
    # C prototype: ssize_t getline(char **lineptr, size_t *n, FILE *stream);
    ssize_t getline(char **, size_t *, FILE *)
 
def read_file_slow(filename):
    """Read ``filename`` line by line in binary mode, discarding every line.

    The body is plain Python without any C type declarations; it serves as
    the "compiled but untyped" variant in the benchmark.

    Args:
        filename: Path of the file to read.

    Returns:
        An empty list, so the result can still be iterated by callers.
        Enable the ``yield`` statement below to turn the function into a
        generator that hands each line to the caller instead.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # The context manager closes the file even if readline() raises.
    with open(filename, "rb") as f:
        while True:
            line = f.readline()
            if not line:
                break

            #yield line

    return []
 
def read_file(filename):
    """Read ``filename`` line by line through C stdio, discarding every line.

    The file is opened with ``fopen`` and consumed with ``getline``, so the
    loop runs without creating Python objects per line.

    Args:
        filename: Path of the file to read, as a ``str``; it is encoded to
            UTF-8 before being passed to ``fopen``.

    Returns:
        An empty list, so the result can still be iterated by callers.
        Enable the ``yield`` statement below to turn the function into a
        generator that hands each line to the caller instead.

    Raises:
        FileNotFoundError: If ``fopen`` returns NULL.
    """
    # Keep a reference to the bytes object: ``fname`` points into its buffer.
    filename_byte_string = filename.encode("UTF-8")
    cdef char* fname = filename_byte_string

    cdef FILE* cfile
    cfile = fopen(fname, "rb")
    if cfile == NULL:
        raise FileNotFoundError(2, "No such file or directory: '%s'" % filename)

    # getline() allocates (and regrows) this buffer itself when it starts
    # out as NULL with size 0; the caller owns it and must free() it.
    cdef char * line = NULL
    cdef size_t l = 0
    cdef ssize_t read

    try:
        while True:
            read = getline(&line, &l, cfile)
            if read == -1:
                break

            #yield line
    finally:
        # Release the getline() buffer (free(NULL) is a no-op) and the file
        # handle on every exit path; the original leaked the buffer.
        free(line)
        fclose(cfile)

    return []

File: file_io.cpp Comparison code for C++.

#include "stdio.h"
#include <stdlib.h>
 
// Benchmark baseline: read "trajectory.pdb" line by line with getline()
// and discard every line. Returns 1 if the file cannot be opened, else 0.
int main()
{
    FILE* input = fopen("trajectory.pdb", "rb");
    if (!input)
        return 1;

    // getline() allocates and grows this buffer on demand.
    char* buffer = NULL;
    size_t capacity = 0;

    // Consume the file; getline() returns -1 at end of file or on error.
    while (getline(&buffer, &capacity, input) != -1)
    {
    }

    free(buffer);
    fclose(input);

    return 0;
}

File: file_io_bench.py Python code to test and benchmark all different functions.
import timeit
 
#################
count = 10      # number of repetitions per benchmark
check = False   # set to True to compare the output of both Cython readers
#################


def _bench(label, stmt, setup):
    """Run ``stmt`` ``count`` times with timeit, print and return the total.

    ``setup`` is executed once before the timed runs and is not measured.
    """
    elapsed = timeit.Timer(stmt, setup).timeit(count)
    print(label, elapsed, "sec")
    return elapsed


if check:
    # Consistency check: both Cython readers must deliver the same bytes.
    # NOTE(review): this is only meaningful while the readers actually yield
    # their lines; if they return an empty list both digests are the MD5 of
    # no data and trivially match.
    from file_io import read_file, read_file_slow

    import hashlib

    digests = []
    for reader in (read_file_slow, read_file):
        m = hashlib.new("md5")
        for line in reader("trajectory.pdb"):
            m.update(line)
        digests.append(m.hexdigest())
    h1, h2 = digests

    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O.
    if h1 != h2:
        raise AssertionError("read error")
    print("read functions: ok")

t1 = _bench("Python", """for line in read_file_python("trajectory.pdb"):
  pass""", """from file_io_python import read_file_python""")

t2 = _bench("Cython", """for line in read_file_slow("trajectory.pdb"):
  pass""", """from file_io import read_file_slow""")

t3 = _bench("cdef Cython", """for line in read_file("trajectory.pdb"):
  pass""", """from file_io import read_file""")

# Start the compiled binary directly (argument list, no shell) so that the
# start-up time of an intermediate shell is not part of the measurement.
t4 = _bench("C", """subprocess.call(["./a.out"])
""", """import subprocess""")

VN:F [1.9.22_1171]
Rating: 6.6/10 (5 votes cast)
Faster file IO in python using cython, 6.6 out of 10 based on 5 ratings

Leave a Comment


NOTE - You can use these HTML tags and attributes:
<a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>

*