Source code for dynabench.dataset._download

import requests
import shutil
import glob
import os
import hashlib
from tqdm.auto import tqdm


DATASETS_WUEDATA = {
    'advection-cloud-high': 'zbqcxuyReznAgcph',
    'advection-cloud-low': 'GaRdhfPCWxHUzxlQ',
    'advection-cloud-medium': 'owRAjEeBzNIGqvdK',
    'advection-grid-full': 'yykBSNdbcqItyWDh',
    'advection-grid-high': 'nwJlWdrofVkYuCts',
    'advection-grid-low': 'FGItmswlpuHempEw',
    'advection-grid-medium': 'zmciLlnTwTlPVZCz',
    'burgers-cloud-high': 'aGrDKZabZGEXKZYO',
    'burgers-cloud-low': 'wpOlFjtZEGmwiavw',
    'burgers-cloud-medium': 'rDBEKoNFokVeAeRc',
    'burgers-grid-full': 'PDOojyfucyMRRLXu',
    'burgers-grid-high': 'oWQFcnHDPFVTPuFF',
    'burgers-grid-low': 'llIjxBIGcADVYxqP',
    'burgers-grid-medium': 'uINfiLobMAwkqczG',
    'gasdynamics-cloud-high': 'pxXsRMxabNQcHuLc',
    'gasdynamics-cloud-low': 'DZkQFcCKGEKwWQvQ',
    'gasdynamics-cloud-medium': 'oXcHGHypRtrfsFLG',
    'gasdynamics-grid-full': 'PmXYFqMThqzQUHDr',
    'gasdynamics-grid-high': 'mCTBjrtqJUcavlXF',
    'gasdynamics-grid-low': 'zKFoiADdlABeAfkG',
    'gasdynamics-grid-medium': 'ZqNhoHNmhQTPdEwR',
    'kuramotosivashinsky-cloud-high': 'qCgdghMRDesLMCVA',
    'kuramotosivashinsky-cloud-low': 'ApqXiYzrJeKUlrwV',
    'kuramotosivashinsky-cloud-medium': 'phAeHeEJqUgtHGYS',
    'kuramotosivashinsky-grid-full': 'MZwiHmAEZMpDbNTt',
    'kuramotosivashinsky-grid-high': 'ELRRJbfPWZDzZJeS',
    'kuramotosivashinsky-grid-low': 'OAqFMPiCplVdZfPI',
    'kuramotosivashinsky-grid-medium': 'CuyNqLlxtHxXbRss',
    'reactiondiffusion-cloud-high': 'QWuksjgCrrAtLvUf',
    'reactiondiffusion-cloud-low': 'wnJBGyCRApQVYNej',
    'reactiondiffusion-cloud-medium': 'riXsWXMdJAVrEiiw',
    'reactiondiffusion-grid-full': 'eBpfBneUjyylNipB',
    'reactiondiffusion-grid-high': 'IyISptaSPhILTNKG',
    'reactiondiffusion-grid-low': 'IfnaSfmyjxkwHDJs',
    'reactiondiffusion-grid-medium': 'mtBwUKUzUhiQrLLr',
    'wave-cloud-high': 'FNrHjaYEfEXzqvDI',
    'wave-cloud-low': 'YQwODtvCBYffajww',
    'wave-cloud-medium': 'tNMywrnmQlUatFra',
    'wave-grid-full': 'OVNOzAPatjIHbmmD',
    'wave-grid-high': 'eVzNMGmGnuFYMSRM',
    'wave-grid-low': 'IaBtcJtpAriSciCd',
    'wave-grid-medium': 'TnRqCkHNoseAJemZ'
}

BASE_URL = "https://wuedata.uni-wuerzburg.de/radar/api/datasets/%s/download"




def download_raw(equation: str, structure: str, resolution: str, tmp_dir: str = "data/tmp/"):
    """
        Download the raw tar-file from the WUEData API.

        Parameters
        ----------
        equation : str
            Name of the equation to download.
        structure : str
            Description of how the observation points are structured. Can be "cloud" or "grid".
        resolution : str
            Resolution of the dataset. Can be "low", "medium", or "high".
        tmp_dir : str
            Directory where the temporary files should be saved. Defaults to "data/tmp/".
        
            
        Returns
        -------
        None
    """


    # make an HTTP request within a context manager
    url = BASE_URL % DATASETS_WUEDATA[f"{equation}-{structure}-{resolution}"]
    with requests.get(url, stream=True) as r:
        
        # check header to get content length, in bytes
        total_length = int(r.headers.get("Content-Length"))
        
        # implement progress bar via tqdm
        with tqdm.wrapattr(r.raw, "read", total=total_length, desc="") as raw:
        
            # save the output to a file
            out_path = os.path.join(tmp_dir, f"{equation}-{structure}-{resolution}.tar")
            with open(out_path, 'wb') as output:
                shutil.copyfileobj(raw, output)
            

            #shutil.rmtree("data/tmp")

[docs] def download_equation(equation: str, structure: str, resolution: str, data_dir: str = "data", tmp_dir: str = "tmp"): """ Download a dataset and unpack it to the right place. Parameters ---------- equation : str Name of the equation to download. structure : str Description of how the observation points are structured. Can be "cloud" or "grid". resolution : str Resolution of the dataset. Can be "low", "medium", or "high". data_dir : str Directory where the dataset should be saved. Defaults to "data/". tmp_dir : str Directory where the temporary files should be saved. Defaults to "data/tmp/". This directory will be deleted after the dataset is unpacked. Returns ------- None """ # paths tmp_dir = os.path.join(data_dir, tmp_dir) os.makedirs(tmp_dir, exist_ok=True) # download the tar file print("Downloading data...") download_raw(equation, structure, resolution, tmp_dir) # unpack the tar file print("Unpacking data...") tar_path = os.path.join(tmp_dir, f"{equation}-{structure}-{resolution}.tar") tmp_target_path = os.path.join(tmp_dir, f"{equation}-{structure}-{resolution}") shutil.unpack_archive(tar_path, tmp_target_path) # check md5 sums print("Checking md5 sums...") manifest_template = os.path.join(tmp_target_path, "*", "manifest-md5.txt") manifest_path = glob.glob(manifest_template)[0] with open(manifest_path, 'r') as f: manifest = f.read() file_template = os.path.join(tmp_target_path, "*", "data", "dataset", "*.h5") for file in glob.glob(file_template): with open(file, 'rb') as f: file_hash = hashlib.md5(f.read()).hexdigest() if file_hash not in manifest: print(f"MD5 sum of {file} does not match the manifest.") # move the data to the right place print("Moving data...") target_path = os.path.join(data_dir, equation, structure, resolution) os.makedirs(target_path, exist_ok=True) for file in glob.glob(file_template): try: shutil.move(file, target_path) except shutil.Error: print(f"File {file} cannot be moved to {target_path}. Skipping.") # clean up print("Cleaning up...") shutil.rmtree(tmp_dir)
if __name__ == "__main__": download_equation("wave", "cloud", "low")