python script to check if the downloaded file from URL is corrupted or not

I need to download files from URLs and store them on the local system. The files come in different formats, such as pdf, ppt, doc, docx, zip, jpg, and iso. After downloading the files, is there a way to check whether they are corrupted? Most of the suggestions I found use a checksum approach, but I do not have checksum details for the files at the URLs to compare against the downloaded files.

I used the code below to download the file:

# Download the file and save the raw response bytes to disk.
response = requests.get(url_to_download_file, timeout=5)
# Fail loudly on 4xx/5xx responses so that an HTTP error page is never
# written to disk as if it were the requested file.
response.raise_for_status()
with open(local_save_path, 'wb') as file:
    file.write(response.content)

I used the code below to download the file:

# Download the file and save the raw response bytes to disk.
response = requests.get(url_to_download_file, timeout=5)
# Fail loudly on 4xx/5xx responses so that an HTTP error page is never
# written to disk as if it were the requested file.
response.raise_for_status()
with open(local_save_path, 'wb') as file:
    file.write(response.content)

Share Improve this question asked Mar 24 at 6:35 user166013 1,5415 gold badges22 silver badges38 bronze badges

2 You cannot validate the content without some additional term of reference – Adon Bilivit Commented Mar 24 at 8:19

Add a comment |

2 Answers 2

Sorted by: Reset to default 2

You can only reliably validate a downloaded file when you have some additional information about its content, e.g., an MD5 hash.

For example, if you want to download a Python release, you could go to the python.org downloads page for that release. There you will see an MD5 Sum noted alongside the download hyperlink.

Take a copy of the MD5 Sum then carry out the download programmatically as follows:

import hashlib
import requests
from pathlib import Path

# Download a file in chunks, hashing it on the fly, then compare the digest
# with the checksum published alongside the download link.
URL = "https://www.python.org/ftp/python/3.13.2/python-3.13.2-macos11.pkg"
# Save under the URL's final path component (the file name).
TARGET = Path("/Volumes/Spare") / URL.split("/")[-1]
CHUNK = 8192  # bytes requested per iter_content() chunk
HV = "9916903cff14723bdbc41e3ed5530eeb" # copied from the python.org downloads page
TIMEOUT = 30  # seconds; without a timeout requests can block indefinitely

hasher = hashlib.md5()

with requests.get(URL, stream=True, timeout=TIMEOUT) as response:
    # Abort on 4xx/5xx instead of hashing and saving an error page.
    response.raise_for_status()
    with TARGET.open("wb") as target:
        print("Downloading...", end="", flush=True)
        for chunk in response.iter_content(CHUNK):
            # Feed the same bytes to the file and to the hash so the digest
            # describes exactly what was written.
            target.write(chunk)
            hasher.update(chunk)
    if hasher.hexdigest() == HV:
        print("Valid")
    else:
        print("Invalid/corrupt")

You can check if a downloaded file is corrupted without using checksums by attempting to open it with libraries specific to each file format. If the file opens successfully, it's likely not corrupted. It might be corrupted if it doesn't open or raises an error.

Here's an example of how you could download and check the file:

import requests
from PyPDF2 import PdfFileReader
from pptx import Presentation
from docx import Document
from PIL import Image
import zipfile

def check_pdf(file_path):
    """Return True if the PDF at *file_path* can be opened and its pages counted.

    Any exception raised while opening or parsing the file is printed with a
    "PDF error:" prefix and reported as False.
    """
    try:
        # PdfFileReader / getNumPages() were removed in PyPDF2 3.0 and raise
        # on use there, which would make every PDF look corrupt. Prefer the
        # replacement API and fall back to the legacy one on old releases.
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            PdfReader = None
        with open(file_path, "rb") as f:
            if PdfReader is not None:
                # Counting the pages forces the page tree to be parsed.
                _ = len(PdfReader(f).pages)
            else:
                PdfFileReader(f).getNumPages()
        return True
    except Exception as e:
        print(f"PDF error: {e}")
        return False

def check_pptx(file_path):
    """Report whether ``Presentation`` can open the file at *file_path*.

    Returns True when the constructor completes without raising. Otherwise
    the exception is printed with a "PPTX error:" prefix and False is
    returned.
    """
    try:
        Presentation(file_path)
    except Exception as exc:
        print(f"PPTX error: {exc}")
        return False
    else:
        return True

def check_docx(file_path):
    """Report whether ``Document`` can open the file at *file_path*.

    Returns True when the constructor completes without raising. Otherwise
    the exception is printed with a "DOCX error:" prefix and False is
    returned.
    """
    opened = True
    try:
        Document(file_path)
    except Exception as err:
        print(f"DOCX error: {err}")
        opened = False
    return opened

def check_image(file_path):
    """Return True if the image at *file_path* passes verification and decodes.

    Any exception raised along the way is printed with an "Image error:"
    prefix and reported as False.
    """
    try:
        # First pass: integrity check. The context manager guarantees the
        # underlying file handle is released even when verify() raises.
        with Image.open(file_path) as img:
            img.verify()  # Verify that it's a valid image
        # verify() does not decode the pixel data, so a truncated download
        # can still pass it. The image must be reopened after verify();
        # load() then decodes it fully and raises on truncated data.
        with Image.open(file_path) as img:
            img.load()
        return True
    except Exception as e:
        print(f"Image error: {e}")
        return False

def check_zip(file_path):
    """Return True if every member of the ZIP archive at *file_path* is intact.

    Returns False, after printing a "ZIP error:" message, when the archive
    cannot be opened or when any member fails its CRC/header check.
    """
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # testzip() does not raise for a damaged member: it returns the
            # name of the first bad file, or None when all members are fine.
            first_bad = zip_ref.testzip()
    except Exception as e:
        print(f"ZIP error: {e}")
        return False
    if first_bad is not None:
        print(f"ZIP error: corrupt member {first_bad!r}")
        return False
    return True

def check_iso(file_path):
    """Return True if *file_path* carries an ISO 9660 / UDF volume identifier.

    This is a lightweight structural check, not a full validation: it only
    confirms that the image is long enough to contain the first volume
    descriptor and that the descriptor has a recognised identifier. Damage
    further into the image is not detected.

    Returns False, after printing an "ISO error:" message, when the file
    cannot be read or the identifier is missing.
    """
    # Volume descriptors start at sector 16 of 2048-byte sectors (byte 32768).
    # Each begins with a 1-byte type followed by a 5-byte standard identifier:
    # "CD001" for ISO 9660, "BEA01" at the start of a UDF recognition sequence.
    identifier_offset = 16 * 2048 + 1
    known_identifiers = (b"CD001", b"BEA01")
    try:
        with open(file_path, "rb") as f:
            f.seek(identifier_offset)
            identifier = f.read(5)
    except Exception as e:
        print(f"ISO error: {e}")
        return False
    if identifier not in known_identifiers:
        # Covers empty and truncated files too: the read comes back short.
        print(f"ISO error: no ISO 9660/UDF volume descriptor found in {file_path}")
        return False
    return True

def download_and_check(url, local_save_path):
    """Download *url* to *local_save_path* and run a format-specific check.

    Args:
        url: Address of the file to download.
        local_save_path: Destination path as a string; its extension
            (matched case-insensitively) selects the checker.

    Returns:
        True if the matching checker accepts the file. False if the server
        answered with an error status, the extension is not recognised, or
        the checker rejects the file.

    Connection errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    # Stream to disk in chunks so large files (e.g. ISO images) are not held
    # in memory in one piece.
    with requests.get(url, timeout=5, stream=True) as response:
        if not response.ok:
            # Do not save a 4xx/5xx error page as if it were the file.
            print(f"Download error: HTTP {response.status_code} for {url}")
            return False
        with open(local_save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    # Check if the file is corrupted based on file extension
    checkers = (
        ((".pdf",), check_pdf),
        ((".pptx",), check_pptx),
        ((".docx",), check_docx),
        ((".jpg", ".jpeg"), check_image),
        ((".zip",), check_zip),
        ((".iso",), check_iso),
    )
    # Lower-case once so names such as "REPORT.PDF" are recognised as well.
    lowered_path = local_save_path.lower()
    for extensions, checker in checkers:
        if lowered_path.endswith(extensions):
            return checker(local_save_path)

    print(f"Unknown file type for {local_save_path}")
    return False

# Example usage: download one PDF and report the outcome of the check.
# The host is a placeholder; substitute the real download URL.
url_to_download_file = "http://example.com/sample.pdf"
local_save_path = "sample.pdf"
if download_and_check(url_to_download_file, local_save_path):
    print(f"File {local_save_path} is not corrupted")
else:
    print(f"File {local_save_path} may be corrupted")

发布者：admin，转载请注明出处：http://www.yc00.com/questions/1744257565a4565473.html

python script to check if the downloaded file from URL is corrupted or not - Stack Overflow

2 Answers 2

发表回复

评论列表（0条）

联系我们

400-800-8888

python script to check if the downloaded file from URL is corrupted or not - Stack Overflow

2 Answers 2

相关推荐