Batch-downloading Hugging Face models and datasets

Why:

When running AI applications locally, a model can easily be a dozen gigabytes or more, and the downloads keep eating up time. Hugging Face is hosted overseas, so access from within China is often unstable, and pulling large files with git lfs clone keeps getting interrupted, which is very inconvenient.
This script lets you download datasets and models from Hugging Face reliably.

Flow:

Request the repository page, collect all of the download links with requests, mimic a browser download for each file, and create matching subdirectories to hold the model files. Downloads show a progress bar and resume from wherever they were interrupted (a minimal sketch of the resume mechanism follows).
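The resume part boils down to an HTTP Range header: the client tells the server how many bytes it already has on disk, and the server sends only the rest. Here is a minimal sketch of just that mechanism, separate from the full script below; the url and filename arguments are placeholders, and it assumes the server honors Range requests (Hugging Face's file endpoints do):

import os
import urllib.request

def resume_download(url, filename):
    # Bytes already on disk from an earlier, interrupted attempt
    done = os.path.getsize(filename) if os.path.exists(filename) else 0
    req = urllib.request.Request(url)
    if done:
        # Ask the server to send only the bytes after what we already have
        req.add_header('Range', f'bytes={done}-')
    # 'ab' appends the newly received bytes after the existing partial content
    with urllib.request.urlopen(req) as resp, open(filename, 'ab') as f:
        while chunk := resp.read(1024 * 1024):
            f.write(chunk)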

Code:

import os
import requests
from lxml import etree
import urllib.request
from tqdm import tqdm


# Entry point: parse the repository's file listing page
def start_run(url, head, root_dir_name):
    response = requests.get(url, headers=head)
    print(response)
    # Only proceed on a 2xx status code
    if 200 <= response.status_code < 300:
        html = etree.HTML(response.text)
        model_dir_list = html.xpath('/html/body/div[1]/main/div[2]/section/div[3]/ul/li')
        
        for model_dir in model_dir_list:
            # Crude heuristic: file rows and folder rows carry different CSS
            # classes on this page, so the length of the class attribute
            # tells them apart
            test = len(model_dir.xpath('div[1]/@class')[0])
            # A file: download it directly
            if test > 24:
                model_down_url = model_dir.xpath('a[1]/@href')[0]
                model_url = 'https://huggingface.co'+model_down_url
                model_name = model_down_url.split('/')[-1].split('?')[0]
                down_url(model_url, model_name, dir_name=root_dir_name)
            # A folder: create it locally, then fetch its listing
            else:
                dir_name = model_dir.xpath('a[1]/span/text()')[0]
                dir_name = root_dir_name + '/' + dir_name
                # Create the subdirectory if it does not already exist
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)
                    print(f"Created subdirectory {dir_name}")
                else:
                    print(f"Subdirectory {dir_name} already exists")

                model_url = model_dir.xpath('a[1]/@href')[0]
                next_url = 'https://huggingface.co'+model_url
                next_rep(next_url, dir_name, head)


# Fetch a subfolder's listing and download every file in it
# (assumes subfolders hold only files, not further nested folders)
def next_rep(next_url, dir_name, head):
    next_response = requests.get(url=next_url, headers=head)

    html = etree.HTML(next_response.text)

    model_down_url_list = html.xpath('/html/body/div[1]/main/div[2]/section/div[3]/ul/li')
    for value in model_down_url_list:
        model_down_url = value.xpath('a[1]/@href')[0]

        model_url = 'https://huggingface.co' + model_down_url
        model_name = model_down_url.split('/')[-1].split('?')[0]
        down_url(model_url, model_name, dir_name)
        

# Download one file with a progress bar, resuming any partial download
def down_url(model_url, model_name, dir_name=''):
    if dir_name != '':
        model_name = dir_name + '/' + model_name

    # Size of any partially downloaded file already on disk
    existing_file_size = 0
    if os.path.exists(model_name):
        existing_file_size = os.path.getsize(model_name)

    # Ask for the full size up front so finished files can be skipped
    # instead of re-downloaded
    probe = urllib.request.Request(model_url, method='HEAD')
    with urllib.request.urlopen(probe) as resp:
        total_size = int(resp.getheader('Content-Length', 0))

    if total_size and existing_file_size >= total_size:
        print(f"Skipped: {model_name} is already complete")
        return

    # Some tqdm display variants, for reference:
    # 1. Basic style
    # tqdm(total=100, desc="Downloading", unit='B', unit_scale=True)
    # Downloading: 60%|███████████████           | 60MB/100MB [00:30<00:20, 2MB/s]
    #
    # 2. ASCII progress blocks
    # tqdm(total=100, ascii=True)
    # 40%|####                            | 40/100 [00:20<00:30]
    #
    # 3. Custom bar_format
    # tqdm(total=100, bar_format="{l_bar} {bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]")
    # Downloading... |███████████████████         | 70/100 [00:35<00:15, 2MB/s]
    #
    # 4. Percentage only
    # tqdm(total=100, bar_format="{percentage:3.0f}%")
    #  75%
    #
    # 5. Detailed info
    # tqdm(total=100, bar_format="{desc}: {percentage:3.0f}%|{bar}| {n}/{total} [{rate_fmt}{postfix}]")
    # Downloading:  90%|████████████████████████ | 90/100 [2MB/s]

    # Resume via an HTTP Range request: the server sends only the bytes
    # we do not have yet
    req = urllib.request.Request(model_url)
    if existing_file_size:
        req.add_header('Range', f'bytes={existing_file_size}-')

    # Keep the response open for the whole read loop and append to the file
    with urllib.request.urlopen(req) as response, open(model_name, 'ab') as f, tqdm(
        desc=model_name,
        total=total_size,
        initial=existing_file_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        chunk_size = 1024 * 1024  # read 1 MB at a time
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            f.write(chunk)
            bar.update(len(chunk))


if __name__ == '__main__':
    # Repository pages to scrape
    url_list = [
        'https://huggingface.co/zcxu-eric/MagicAnimate/tree/main',
        'https://huggingface.co/stabilityai/sd-vae-ft-mse/tree/main',
        'https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main',
    ]
    # Request headers
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    for url in url_list:
        # Use the repository name (e.g. MagicAnimate) as the root directory
        root_dir_name = url.split('/')[-3]
        if not os.path.exists(root_dir_name):
            os.makedirs(root_dir_name)
            print(f"Created root directory {root_dir_name}")
        else:
            print(f"Root directory {root_dir_name} already exists\n")
        start_run(url, head, root_dir_name)
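One detail worth noting: the script assumes the per-file anchors scraped from the tree page point at .../resolve/main/... URLs, the kind that serve the raw file bytes (the trailing ?download=true query string is what the split('?') strips off). Ordinary file links on Hugging Face use /blob/ paths, which return an HTML preview page rather than the file itself. If a scraped href ever turns out to be a blob link, a small hypothetical helper like this (not part of the script above) converts it:

def blob_to_resolve(href):
    # e.g. '/org/repo/blob/main/model.safetensors'
    #   -> '/org/repo/resolve/main/model.safetensors'
    return href.replace('/blob/', '/resolve/', 1)

Because downloads append to the file and resume via Range requests, rerunning the script after an interruption picks each file up where it stopped, and files that already match their full size are skipped.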


