I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.
However, although I'm happy with the file-size splitting, I noticed that within these smaller split files the contents can be cut anywhere within a line. I would like to keep each line intact, i.e. split at the end of a line while still respecting the byte-size limit.
See the following code I have:
import os
import sys
def getfilesize(sfilename):
    """Return the size of *sfilename* in bytes.

    Args:
        sfilename (str): path of the file to measure.

    Returns:
        int: file size in bytes.
    """
    with open(sfilename, "rb") as fr:
        fr.seek(0, os.SEEK_END)  # move to end of the file
        size = fr.tell()
    # Diagnostic print kept: the rest of the script traces progress this way.
    print("getfilesize: size: %s" % size)
    # Return the captured size rather than a redundant second fr.tell().
    return size
def splitfile(spath, sfilename, splitsize, dpath):
    """Split ``spath + sfilename`` into files of at most ``splitsize`` bytes,
    cutting only at line boundaries.

    Output files are written to ``dpath`` as name_1.ext, name_2.ext, ...
    A single line longer than ``splitsize`` is written to a file of its own
    rather than being cut mid-line.

    Args:
        spath (str): source directory, including trailing separator.
        sfilename (str): name of the file to split.
        splitsize (int): maximum size of each output file, in bytes.
        dpath (str): destination directory, including trailing separator.
    """
    source = spath + sfilename
    if not os.path.isfile(source):
        # Format the full path inside the quotes (the original formatted only
        # spath and concatenated sfilename outside the message).
        print("No such file as: \"%s\"" % source)
        return
    # splitext is safer than str.split(".") for names containing extra dots.
    root, ext = os.path.splitext(dpath + sfilename)
    counter = 0
    written = 0       # bytes written to the current output file so far
    out = None
    try:
        with open(source, "rb") as fr:
            # Iterating a binary file still yields newline-terminated lines,
            # so every write below ends exactly at a line boundary.
            for line in fr:
                # Open a new output file when the current one would overflow;
                # `written` truthiness lets an oversize line go alone into a
                # fresh file instead of looping forever.
                if out is None or (written and written + len(line) > splitsize):
                    if out is not None:
                        out.close()
                    counter += 1
                    out = open("%s_%d%s" % (root, counter, ext), "wb")
                    written = 0
                out.write(line)
                written += len(line)
    finally:
        if out is not None:
            out.close()
if __name__ == "__main__":
    # argv layout: [script, spath, sfilename, splitsizeinkb, dpath] -> 5
    # entries required; the original check (< 4) still allowed an IndexError
    # on sys.argv[4], and the usage text omitted spath/dpath.
    if len(sys.argv) < 5:
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)
Is it possible to achieve this? If so how can this be done?
I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.
However although I'm happy with the file size splitting I noticed within these smaller split files the file contents can be cut anywhere on the line, but I would like to retain the line and split at the end of the line with the byte size.
See the following code I have:
import os
import sys
def getfilesize(sfilename):
    """Report and return the byte size of the given file."""
    handle = open(sfilename, "rb")
    try:
        handle.seek(0, 2)  # whence=2: seek relative to end of file
        total = handle.tell()
        print("getfilesize: size: %s" % total)
        return handle.tell()
    finally:
        handle.close()
def splitfile(spath, sfilename, splitsize, dpath):
    """Split spath+sfilename into files of roughly ``splitsize`` bytes each.

    NOTE(review): chunks are raw fixed-size byte reads, so a split file can
    end in the middle of a line — this is exactly the behaviour the question
    asks to change; line-aligned splitting needs line-aware reads instead.
    """
    # Open original file in read only mode
    if not os.path.isfile(spath + sfilename):
        # NOTE(review): %s binds only to spath; sfilename is concatenated
        # after formatting, so it appears outside the quotes in the message.
        print("No such file as: \"%s\"" % spath + sfilename)
        return
    filesize=getfilesize(spath + sfilename)
    fullsfilepath = spath + sfilename
    fulldfilepath = dpath + sfilename
    with open(spath + sfilename,"rb") as fr:
        counter=1
        # NOTE(review): split(".") breaks for paths containing extra dots
        # (e.g. "./dir/file.txt"); os.path.splitext would be safer.
        inalfilename = fullsfilepath.split(".")
        newfilename = fulldfilepath.split(".")
        print(inalfilename)
        readlimit = 5000 #read 5kb at a time
        n_splits = filesize//splitsize
        print("splitfile: No of splits required: %s" % str(n_splits))
        for i in range(n_splits+1):
            # Number of readlimit-sized reads that fit in one split file.
            chunks_count = int(splitsize)//int(readlimit)
            data_5kb = fr.read(readlimit) # read
            # Create split files
            print("chunks_count: %d" % chunks_count)
            with open(newfilename[0]+"_{id}.".format(id=str(counter))+newfilename[1],"ab") as fw:
                fw.seek(0)
                fw.truncate()# truncate original if present
                # Copy up to chunks_count+1 chunks into this split file; the
                # loop stops early when the source file is exhausted.
                while data_5kb:
                    fw.write(data_5kb)
                    if chunks_count:
                        chunks_count-=1
                        data_5kb = fr.read(readlimit)
                    else: break
            counter+=1
if __name__ == "__main__":
    # argv layout: [script, spath, sfilename, splitsizeinkb, dpath] -> 5
    # entries required; the original check (< 4) still allowed an IndexError
    # on sys.argv[4], and the usage text omitted spath/dpath.
    if len(sys.argv) < 5:
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)
Is it possible to achieve this? If so how can this be done?
Share Improve this question edited Nov 21, 2024 at 20:01 rob asked Nov 21, 2024 at 2:15 robrob 1731 silver badge11 bronze badges 5 |1 Answer
Reset to default 0 Maybe you should determine the maximum line size first; then you will know how large the chunk size needs to be. Here is code that may help:
import os
import sys
from pathlib import Path
def lines_max_size(filepath: str) -> int:
    """Return the byte length of the longest line in *filepath*.

    The original used sys.getsizeof(line), which measures the Python str
    object (header, padding, interning overhead), not the line's on-disk
    size — chunk sizes derived from it were inflated and inconsistent.

    Args:
        filepath (str): path of the file to scan.

    Returns:
        int: byte length (newline included) of the longest line; 0 for an
        empty file.
    """
    longest = 0
    with open(filepath, "r") as f:
        for line in f:
            longest = max(longest, len(line.encode("utf-8")))
    return longest
def split_file(filepath: str, chunk_size: int, outfile_prefix):
    """Split a text file into line-aligned chunks of at most chunk_size bytes.

    Fixes relative to the original:
    * sizes are measured with len(line.encode()), since sys.getsizeof
      reports the Python object size, not the content size;
    * output names keep the dot before the extension (the original built
      e.g. "output1txt" by splitting on "." and dropping the separator);
    * a running byte counter avoids re-encoding the whole buffer per line.

    Args:
        filepath (str): path of file to split.
        chunk_size (int): upper bound, in bytes, for each output file.
        outfile_prefix (str): output files are named outfile_prefix + count
            + original extension, next to the input file.

    Raises:
        Exception: if a single line already exceeds chunk_size.
    """
    src = Path(filepath)
    save_path = src.parent
    suffix = src.suffix  # includes the leading dot, e.g. ".txt"
    content = ""
    content_size = 0     # running byte size of `content`
    count = 1
    with open(filepath, "r") as f:
        for line in f:
            line_size = len(line.encode("utf-8"))
            if line_size > chunk_size:
                raise Exception("current line size is %s, chunk_size - %s too small to splite file by lines" % (
                    line_size,
                    chunk_size
                ))
            if content_size + line_size <= chunk_size:
                content += line
                content_size += line_size
            else:
                fp = save_path.joinpath(outfile_prefix + str(count) + suffix)
                with open(fp, "w") as fw:
                    fw.write(content)
                content = line
                content_size = line_size
                count += 1
    if content:
        fp = save_path.joinpath(outfile_prefix + str(count) + suffix)
        with open(fp, "w") as fw:
            fw.write(content)
def list_files_content_size(path: Path, file_prefix: str):
    """Print the on-disk byte size of files under *path* whose name starts
    with *file_prefix*.

    Uses os.path.getsize instead of sys.getsizeof(f.read()) — the latter
    measures the Python string object (header + padding), not the file's
    actual content size.

    Args:
        path (Path): directory whose files are listed.
        file_prefix (str): only files whose name starts with this are shown.
    """
    for fn in os.listdir(path):
        if not fn.startswith(file_prefix):
            continue
        fp = path.joinpath(fn)
        size = os.path.getsize(fp)
        print(fn, str(size) + "bytes")
if __name__ == "__main__":
    target = "a.txt"
    # Step 1: report the longest line so chunk_size can be sanity-checked.
    print("max size of lines is: ", lines_max_size(target))
    # Step 2: split the file into line-aligned chunks of at most 240 bytes.
    split_file(filepath="a.txt", chunk_size=240, outfile_prefix="output")
    # Step 3: list the sizes of the generated chunk files.
    script_dir = Path(__file__).parent
    list_files_content_size(script_dir, "output")
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1742318270a4421281.html
1024*1024*1024*1024/4096 = 268435456
. ~260M reads with readlimit 4096 – LMC Commented Nov 21, 2024 at 2:47
Please use GB, because gb is not defined and Gb is gigabits. Likewise please use MB for megabytes, since your mb would read as millibits. – Mark Setchell Commented Nov 21, 2024 at 7:31
parallel --pipe --block 100m wc :::: YOURFILE
– Mark Setchell Commented Nov 21, 2024 at 10:16