# 04-进阶方向:自然语言处理(NLP)——Hugging Face实战
Hugging Face实战加载模型、分词器、微调、Pipeline一、Hugging Face生态概述1.1 核心组件importnumpyasnpimportmatplotlib.pyplotaspltfrommatplotlib.patchesimportRectangle,FancyBboxPatchimportwarnings warnings.filterwarnings(ignore)print(*60)print(Hugging Face TransformersNLP开发的瑞士军刀)print(*60)# Hugging Face生态图fig,axplt.subplots(figsize(12,8))ax.axis(off)# 中心centerplt.Circle((0.5,0.5),0.12,colorlightcoral,ecblack)ax.add_patch(center)ax.text(0.5,0.5,Hugging\nFace,hacenter,vacenter,fontsize10,fontweightbold)# 周边库libraries{Transformers:(0.15,0.75),Datasets:(0.85,0.75),Tokenizers:(0.15,0.25),Accelerate:(0.85,0.25),PEFT:(0.5,0.85),Gradio:(0.5,0.15),}forlib,(x,y)inlibraries.items():circleplt.Circle((x,y),0.08,colorlightblue,ecblack)ax.add_patch(circle)ax.text(x,y,lib,hacenter,vacenter,fontsize7)# 连接到中心ax.annotate(,xy(x,y),xytext(0.5,0.5),arrowpropsdict(arrowstyle-,colorgray,lw1,alpha0.5))ax.set_xlim(0,1)ax.set_ylim(0,1)ax.set_title(Hugging Face生态系统,fontsize14)plt.tight_layout()plt.show()print(\n Hugging Face核心组件:)print( - Transformers: 模型库数千个预训练模型)print( - Datasets: 数据集库数百个公开数据集)print( - Tokenizers: 高性能分词器)print( - Accelerate: 分布式训练加速)print( - PEFT: 参数高效微调)print( - Gradio: 快速部署演示)二、加载模型与分词器2.1 基础加载defload_model_tokenizer():加载模型和分词器print(\n*60)print(加载模型和分词器)print(*60)code from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification import torch # 1. 加载分词器 tokenizer AutoTokenizer.from_pretrained(bert-base-uncased) # 2. 加载模型 model AutoModel.from_pretrained(bert-base-uncased) # 3. 加载特定任务模型 classifier AutoModelForSequenceClassification.from_pretrained( bert-base-uncased, num_labels2 ) # 4. 查看模型信息 print(f模型名称: {model.config.model_type}) print(f隐藏层维度: {model.config.hidden_size}) print(f层数: {model.config.num_hidden_layers}) print(f注意力头数: {model.config.num_attention_heads}) print(f参数量: {sum(p.numel() for p in model.parameters()):,}) # 5. 基本使用 text Hello, Hugging Face! 
inputs tokenizer(text, return_tensorspt) outputs model(**inputs) print(f输出形状: {outputs.last_hidden_state.shape}) print(code)load_model_tokenizer()2.2 分词器详解deftokenizer_detailed():分词器详解print(\n*60)print(分词器详解)print(*60)code from transformers import AutoTokenizer tokenizer AutoTokenizer.from_pretrained(bert-base-uncased) # 1. 基本分词 text Hello, Hugging Face! How are you? tokens tokenizer.tokenize(text) print(f分词结果: {tokens}) # 2. 转换为ID ids tokenizer.convert_tokens_to_ids(tokens) print(fToken IDs: {ids}) # 3. 完整编码 encoding tokenizer( text, truncationTrue, # 截断 paddingmax_length, # 填充 max_length128, # 最大长度 return_tensorspt # 返回PyTorch张量 ) print(fInput IDs shape: {encoding[input_ids].shape}) print(fAttention Mask shape: {encoding[attention_mask].shape}) # 4. 批量编码 texts [First sentence., Second sentence., Third sentence.] batch tokenizer( texts, paddingTrue, truncationTrue, return_tensorspt ) print(f批量编码形状: {batch[input_ids].shape}) # 5. 解码 decoded tokenizer.decode(batch[input_ids][0]) print(f解码结果: {decoded}) # 6. 特殊token print(f[CLS] ID: {tokenizer.cls_token_id}) print(f[SEP] ID: {tokenizer.sep_token_id}) print(f[PAD] ID: {tokenizer.pad_token_id}) print(f[MASK] ID: {tokenizer.mask_token_id}) print(code)tokenizer_detailed()三、Pipeline开箱即用3.1 Pipeline基础defpipeline_demo():Pipeline使用print(\n*60)print(Pipeline一行代码完成NLP任务)print(*60)code from transformers import pipeline # 1. 情感分析 classifier pipeline(sentiment-analysis) result classifier(I love this product!) print(f情感分析: {result}) # 2. 文本生成 generator pipeline(text-generation, modelgpt2) result generator(Once upon a time, max_length50, num_return_sequences1) print(f文本生成: {result[0][generated_text]}) # 3. 问答系统 qa pipeline(question-answering) result qa( questionWhat is Hugging Face?, contextHugging Face is a company that develops machine learning tools. ) print(f问答: {result}) # 4. 命名实体识别 ner pipeline(ner, modeldbmdz/bert-large-cased-finetuned-conll03-english) result ner(My name is John and I live in New York.) 
for entity in result: print(f{entity[word]}: {entity[entity]} (置信度: {entity[score]:.3f})) # 5. 文本摘要 summarizer pipeline(summarization, modelfacebook/bart-large-cnn) text Hugging Face Transformers provides thousands of pretrained models to perform tasks on texts suchasclassification,information extraction,question answering,summarization,translation,text generationandmore. result summarizer(text, max_length30, min_length10) print(f摘要: {result[0][summary_text]}) # 6. 翻译 translator pipeline(translation_en_to_fr, modelt5-small) result translator(Hello, how are you?) print(f翻译: {result[0][translation_text]}) # 7. 零样本分类 classifier pipeline(zero-shot-classification, modelfacebook/bart-large-mnli) result classifier( I love this movie!, candidate_labels[positive, negative, neutral] ) print(f零样本分类: {result[labels][0]} (置信度: {result[scores][0]:.3f})) # 8. 特征提取 feature_extractor pipeline(feature-extraction, modelbert-base-uncased) features feature_extractor(Hello world!) print(f特征形状: {np.array(features).shape}) print(code)pipeline_demo()3.2 自定义Pipelinedefcustom_pipeline():自定义Pipelineprint(\n*60)print(自定义Pipeline)print(*60)code from transformers import Pipeline import torch class CustomNERPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): preprocess_kwargs {} return preprocess_kwargs, {}, {} def preprocess(self, inputs): # 分词 return self.tokenizer(inputs, return_tensorspt, truncationTrue) def _forward(self, model_inputs): # 前向传播 outputs self.model(**model_inputs) return outputs def postprocess(self, model_outputs): # 后处理 logits model_outputs.logits predictions torch.argmax(logits, dim2) return predictions.tolist() # 创建自定义Pipeline ner_pipeline CustomNERPipeline( modelAutoModelForTokenClassification.from_pretrained(bert-base-uncased), tokenizerAutoTokenizer.from_pretrained(bert-base-uncased) ) # 使用 result ner_pipeline(John lives in New York) print(result) print(code)custom_pipeline()四、模型微调4.1 完整微调流程deffinetuning_demo():模型微调完整流程print(\n*60)print(模型微调完整流程)print(*60)code 
from transformers import ( AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding ) from datasets import load_dataset import numpy as np from sklearn.metrics import accuracy_score, f1_score # 1. 加载数据集 dataset load_dataset(imdb) # 2. 加载分词器和模型 model_name bert-base-uncased tokenizer AutoTokenizer.from_pretrained(model_name) model AutoModelForSequenceClassification.from_pretrained(model_name, num_labels2) # 3. 数据预处理 def preprocess_function(examples): return tokenizer( examples[text], truncationTrue, paddingTrue, max_length512 ) tokenized_dataset dataset.map(preprocess_function, batchedTrue) # 4. 数据整理器 data_collator DataCollatorWithPadding(tokenizertokenizer) # 5. 评估指标 def compute_metrics(eval_pred): predictions, labels eval_pred predictions np.argmax(predictions, axis1) return { accuracy: accuracy_score(labels, predictions), f1: f1_score(labels, predictions) } # 6. 训练参数 training_args TrainingArguments( output_dir./results, evaluation_strategyepoch, save_strategyepoch, learning_rate2e-5, per_device_train_batch_size16, per_device_eval_batch_size16, num_train_epochs3, weight_decay0.01, logging_dir./logs, logging_steps100, load_best_model_at_endTrue, metric_for_best_modelaccuracy, ) # 7. 创建Trainer trainer Trainer( modelmodel, argstraining_args, train_datasettokenized_dataset[train], eval_datasettokenized_dataset[test], tokenizertokenizer, data_collatordata_collator, compute_metricscompute_metrics, ) # 8. 训练 trainer.train() # 9. 保存模型 model.save_pretrained(./my_model) tokenizer.save_pretrained(./my_model) # 10. 评估 eval_results trainer.evaluate() print(f评估结果: {eval_results}) # 11. 
推理 def predict(text): inputs tokenizer(text, return_tensorspt, truncationTrue, max_length512) outputs model(**inputs) pred torch.argmax(outputs.logits, dim1).item() return 正面 if pred 1 else 负面 print(predict(This movie is great!)) print(code)finetuning_demo()4.2 使用Datasets库defdatasets_demo():Datasets库使用print(\n*60)print(Datasets库使用)print(*60)code from datasets import load_dataset, DatasetDict, concatenate_datasets # 1. 加载内置数据集 dataset load_dataset(imdb) print(f训练集大小: {len(dataset[train])}) print(f测试集大小: {len(dataset[test])}) # 2. 查看数据 sample dataset[train][0] print(f样本: {sample[text][:100]}...) print(f标签: {sample[label]}) # 3. 数据集操作 # 筛选 filtered dataset[train].filter(lambda x: len(x[text]) 200) print(f筛选后: {len(filtered)}) # 映射 def add_prefix(example): example[text] Review: example[text] return example dataset dataset.map(add_prefix) # 4. 划分数据集 splits dataset[train].train_test_split(test_size0.1) dataset DatasetDict({ train: splits[train], validation: splits[test], test: dataset[test] }) # 5. 数据集合并 combined concatenate_datasets([dataset[train], dataset[validation]]) # 6. 流式处理大数据集 streaming_dataset load_dataset(c4, splittrain, streamingTrue) for i, example in enumerate(streaming_dataset): if i 5: break print(example[text][:100]) # 7. 保存和加载 dataset.save_to_disk(./my_dataset) loaded_dataset DatasetDict.load_from_disk(./my_dataset) # 8. 
数据集信息 print(dataset) print(f特征: {dataset[train].features}) print(code)datasets_demo()五、高级功能5.1 混合精度训练defmixed_precision():混合精度训练print(\n*60)print(混合精度训练)print(*60)code from transformers import TrainingArguments # 启用混合精度 training_args TrainingArguments( output_dir./results, fp16True, # 启用FP16混合精度 fp16_opt_levelO1, # 优化级别 per_device_train_batch_size32, # 可以更大 ) # 使用Accelerate库 from accelerate import Accelerator accelerator Accelerator( mixed_precisionfp16, # fp16, bf16, or no gradient_accumulation_steps4 ) model, optimizer, dataloader accelerator.prepare(model, optimizer, dataloader) # 训练循环 for batch in dataloader: with accelerator.autocast(): outputs model(**batch) loss outputs.loss accelerator.backward(loss) optimizer.step() optimizer.zero_grad() print(code)mixed_precision()5.2 模型并行与量化defmodel_parallel():模型并行与量化print(\n*60)print(模型并行与量化)print(*60)code from transformers import AutoModelForCausalLM, BitsAndBytesConfig import torch # 1. 模型并行大模型分片 model AutoModelForCausalLM.from_pretrained( gpt2-large, device_mapauto, # 自动分配到可用设备 load_in_8bitTrue, # 8bit量化 ) # 2. 4bit量化 (QLoRA) bnb_config BitsAndBytesConfig( load_in_4bitTrue, bnb_4bit_quant_typenf4, bnb_4bit_use_double_quantTrue, bnb_4bit_compute_dtypetorch.bfloat16 ) model AutoModelForCausalLM.from_pretrained( meta-llama/Llama-2-7b-hf, quantization_configbnb_config, device_mapauto ) # 3. 梯度检查点节省内存 model.gradient_checkpointing_enable() # 4. 查看模型设备分布 print(model.hf_device_map) print(code)model_parallel()六、模型保存与分享6.1 保存与加载defsave_load_model():模型保存与加载print(\n*60)print(模型保存与加载)print(*60)code from transformers import AutoModel, AutoTokenizer # 1. 保存到本地 model.save_pretrained(./my_model) tokenizer.save_pretrained(./my_model) # 2. 从本地加载 model AutoModel.from_pretrained(./my_model) tokenizer AutoTokenizer.from_pretrained(./my_model) # 3. 推送到Hub需要登录 from huggingface_hub import notebook_login notebook_login() # 输入token model.push_to_hub(my-awesome-model) tokenizer.push_to_hub(my-awesome-model) # 4. 
从Hub加载 model AutoModel.from_pretrained(username/my-awesome-model) # 5. 保存训练状态 trainer.save_model(./checkpoint) trainer.save_state() # 6. 保存完整状态包括优化器 torch.save({ model_state_dict: model.state_dict(), optimizer_state_dict: optimizer.state_dict(), epoch: epoch, loss: loss, }, checkpoint.pt) print(code)save_load_model()七、总结组件功能常用方法AutoTokenizer分词from_pretrained, tokenize, decodeAutoModel加载模型from_pretrained, save_pretrainedPipeline任务接口pipeline, 自定义PipelineTrainer训练train, evaluate, predictDatasets数据load_dataset, map, filter最佳实践使用AutoModel和AutoTokenizer自动匹配模型使用Pipeline快速验证使用Trainer进行标准训练使用Datasets高效处理数据使用Accelerate进行分布式训练