I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.
However, although I'm happy with the file-size splitting, I noticed that within these smaller split files the contents can be cut anywhere within a line. I would like to keep each line intact, i.e. split at the end of a line while still respecting the byte-size limit.
See the following code I have:
import os
import sys
def getfilesize(sfilename):
    """Return the size of *sfilename* in bytes.

    Args:
        sfilename (str): path of the file to measure.

    Returns:
        int: file size in bytes.
    """
    with open(sfilename, "rb") as fr:
        fr.seek(0, os.SEEK_END)  # move to end of the file
        size = fr.tell()
    # Diagnostic print kept: the rest of the script traces progress this way.
    print("getfilesize: size: %s" % size)
    # Return the captured size rather than a redundant second fr.tell().
    return size
def splitfile(spath, sfilename, splitsize, dpath):
    """Split ``spath + sfilename`` into files of at most ``splitsize`` bytes,
    cutting only at line boundaries.

    Output files are written to ``dpath`` as name_1.ext, name_2.ext, ...
    A single line longer than ``splitsize`` is written to a file of its own
    rather than being cut mid-line.

    Args:
        spath (str): source directory, including trailing separator.
        sfilename (str): name of the file to split.
        splitsize (int): maximum size of each output file, in bytes.
        dpath (str): destination directory, including trailing separator.
    """
    source = spath + sfilename
    if not os.path.isfile(source):
        # Format the full path inside the quotes (the original formatted only
        # spath and concatenated sfilename outside the message).
        print("No such file as: \"%s\"" % source)
        return
    # splitext is safer than str.split(".") for names containing extra dots.
    root, ext = os.path.splitext(dpath + sfilename)
    counter = 0
    written = 0       # bytes written to the current output file so far
    out = None
    try:
        with open(source, "rb") as fr:
            # Iterating a binary file still yields newline-terminated lines,
            # so every write below ends exactly at a line boundary.
            for line in fr:
                # Open a new output file when the current one would overflow;
                # `written` truthiness lets an oversize line go alone into a
                # fresh file instead of looping forever.
                if out is None or (written and written + len(line) > splitsize):
                    if out is not None:
                        out.close()
                    counter += 1
                    out = open("%s_%d%s" % (root, counter, ext), "wb")
                    written = 0
                out.write(line)
                written += len(line)
    finally:
        if out is not None:
            out.close()
if __name__ == "__main__":
    # argv layout: [script, spath, sfilename, splitsizeinkb, dpath] -> 5
    # entries required; the original check (< 4) still allowed an IndexError
    # on sys.argv[4], and the usage text omitted spath/dpath.
    if len(sys.argv) < 5:
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)
Is it possible to achieve this? If so how can this be done?
I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.
However although I'm happy with the file size splitting I noticed within these smaller split files the file contents can be cut anywhere on the line, but I would like to retain the line and split at the end of the line with the byte size.
See the following code I have:
import os
import sys
def getfilesize(sfilename):
    """Report and return the byte size of the given file."""
    handle = open(sfilename, "rb")
    try:
        handle.seek(0, 2)  # whence=2: seek relative to end of file
        total = handle.tell()
        print("getfilesize: size: %s" % total)
        return handle.tell()
    finally:
        handle.close()
def splitfile(spath, sfilename, splitsize, dpath):
    """Split spath+sfilename into files of roughly ``splitsize`` bytes each.

    NOTE(review): chunks are raw fixed-size byte reads, so a split file can
    end in the middle of a line — this is exactly the behaviour the question
    asks to change; line-aligned splitting needs line-aware reads instead.
    """
    # Open original file in read only mode
    if not os.path.isfile(spath + sfilename):
        # NOTE(review): %s binds only to spath; sfilename is concatenated
        # after formatting, so it appears outside the quotes in the message.
        print("No such file as: \"%s\"" % spath + sfilename)
        return
    filesize=getfilesize(spath + sfilename)
    fullsfilepath = spath + sfilename
    fulldfilepath = dpath + sfilename
    with open(spath + sfilename,"rb") as fr:
        counter=1
        # NOTE(review): split(".") breaks for paths containing extra dots
        # (e.g. "./dir/file.txt"); os.path.splitext would be safer.
        inalfilename = fullsfilepath.split(".")
        newfilename = fulldfilepath.split(".")
        print(inalfilename)
        readlimit = 5000 #read 5kb at a time
        n_splits = filesize//splitsize
        print("splitfile: No of splits required: %s" % str(n_splits))
        for i in range(n_splits+1):
            # Number of readlimit-sized reads that fit in one split file.
            chunks_count = int(splitsize)//int(readlimit)
            data_5kb = fr.read(readlimit) # read
            # Create split files
            print("chunks_count: %d" % chunks_count)
            with open(newfilename[0]+"_{id}.".format(id=str(counter))+newfilename[1],"ab") as fw:
                fw.seek(0)
                fw.truncate()# truncate original if present
                # Copy up to chunks_count+1 chunks into this split file; the
                # loop stops early when the source file is exhausted.
                while data_5kb:
                    fw.write(data_5kb)
                    if chunks_count:
                        chunks_count-=1
                        data_5kb = fr.read(readlimit)
                    else: break
            counter+=1
if __name__ == "__main__":
    # argv layout: [script, spath, sfilename, splitsizeinkb, dpath] -> 5
    # entries required; the original check (< 4) still allowed an IndexError
    # on sys.argv[4], and the usage text omitted spath/dpath.
    if len(sys.argv) < 5:
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)
Is it possible to achieve this? If so how can this be done?
Share Improve this question edited Nov 21, 2024 at 20:01 rob asked Nov 21, 2024 at 2:15 robrob 1731 silver badge11 bronze badges 5 |1 Answer
Reset to default 0 Maybe you should determine the maximum line size first; then you will know how large the chunk size needs to be. Here is code that may help:
import os
import sys
from pathlib import Path
def lines_max_size(filepath: str) -> int:
    """Return the byte length of the longest line in *filepath*.

    The original used sys.getsizeof(line), which measures the Python str
    object (header, padding, interning overhead), not the line's on-disk
    size — chunk sizes derived from it were inflated and inconsistent.

    Args:
        filepath (str): path of the file to scan.

    Returns:
        int: byte length (newline included) of the longest line; 0 for an
        empty file.
    """
    longest = 0
    with open(filepath, "r") as f:
        for line in f:
            longest = max(longest, len(line.encode("utf-8")))
    return longest
def split_file(filepath: str, chunk_size: int, outfile_prefix):
    """Split a text file into line-aligned chunks of at most chunk_size bytes.

    Fixes relative to the original:
    * sizes are measured with len(line.encode()), since sys.getsizeof
      reports the Python object size, not the content size;
    * output names keep the dot before the extension (the original built
      e.g. "output1txt" by splitting on "." and dropping the separator);
    * a running byte counter avoids re-encoding the whole buffer per line.

    Args:
        filepath (str): path of file to split.
        chunk_size (int): upper bound, in bytes, for each output file.
        outfile_prefix (str): output files are named outfile_prefix + count
            + original extension, next to the input file.

    Raises:
        Exception: if a single line already exceeds chunk_size.
    """
    src = Path(filepath)
    save_path = src.parent
    suffix = src.suffix  # includes the leading dot, e.g. ".txt"
    content = ""
    content_size = 0     # running byte size of `content`
    count = 1
    with open(filepath, "r") as f:
        for line in f:
            line_size = len(line.encode("utf-8"))
            if line_size > chunk_size:
                raise Exception("current line size is %s, chunk_size - %s too small to splite file by lines" % (
                    line_size,
                    chunk_size
                ))
            if content_size + line_size <= chunk_size:
                content += line
                content_size += line_size
            else:
                fp = save_path.joinpath(outfile_prefix + str(count) + suffix)
                with open(fp, "w") as fw:
                    fw.write(content)
                content = line
                content_size = line_size
                count += 1
    if content:
        fp = save_path.joinpath(outfile_prefix + str(count) + suffix)
        with open(fp, "w") as fw:
            fw.write(content)
def list_files_content_size(path: Path, file_prefix: str):
    """Print the on-disk byte size of files under *path* whose name starts
    with *file_prefix*.

    Uses os.path.getsize instead of sys.getsizeof(f.read()) — the latter
    measures the Python string object (header + padding), not the file's
    actual content size.

    Args:
        path (Path): directory whose files are listed.
        file_prefix (str): only files whose name starts with this are shown.
    """
    for fn in os.listdir(path):
        if not fn.startswith(file_prefix):
            continue
        fp = path.joinpath(fn)
        size = os.path.getsize(fp)
        print(fn, str(size) + "bytes")
if __name__ == "__main__":
    target = "a.txt"
    # Step 1: report the longest line so chunk_size can be sanity-checked.
    print("max size of lines is: ", lines_max_size(target))
    # Step 2: split the file into line-aligned chunks of at most 240 bytes.
    split_file(filepath="a.txt", chunk_size=240, outfile_prefix="output")
    # Step 3: list the sizes of the generated chunk files.
    script_dir = Path(__file__).parent
    list_files_content_size(script_dir, "output")
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1742318270a4421281.html
1024*1024*1024*1024/4096 = 268435456
. ~260M reads with readlimit 4096 – LMC Commented Nov 21, 2024 at 2:47
Please use GB, because gb is not defined and Gb is gigabits. Likewise please use MB for megabytes, since your mb would read as millibits. – Mark Setchell Commented Nov 21, 2024 at 7:31
parallel --pipe --block 100m wc :::: YOURFILE
– Mark Setchell Commented Nov 21, 2024 at 10:16