"""Semantic segmentation, from FCN to Segment Anything (SAM).

Reconstructed article notes plus reference implementations.

1. Technique overview
   Evolution: FCN (2015) -> U-Net (2015) -> DeepLab (2016)
   -> Mask R-CNN (2017) -> SAM (2023).

   Method comparison (figures as reported in the original article):
       method      type                 mIoU  notes
       FCN         fully convolutional   62%  end-to-end training
       U-Net       encoder-decoder       85%  medical imaging
       DeepLab     ASPP                  89%  dilated (atrous) convolution
       Mask R-CNN  instance segment.     90%  instance-level masks
       SAM         foundation model      95%  prompt-driven, general purpose

   Common evaluation metrics: mIoU (mean intersection over union),
   pixel accuracy, F1-score.

3. Performance comparison (reported):
       model       mIoU  speed(fps)  params(M)
       FCN-8s       62%      20         134
       U-Net        85%      30          31
       DeepLabv3    89%      25          60
       Mask R-CNN   90%      15         140
       SAM          95%      10         600
   Per-dataset mIoU (FCN / U-Net / DeepLab / SAM):
       Pascal VOC 62/78/89/92%, Cityscapes 70/75/83/88%, COCO 65/72/80/90%
   Model size vs quality:
       U-Net Small 10M 78% 0.5GB; U-Net 31M 85% 1.0GB;
       DeepLab-Lite 20M 82% 0.8GB; DeepLabv3 60M 89% 2.0GB

5. Summary: FCN pioneered end-to-end segmentation; U-Net is the default
   choice for medical imaging; DeepLab enlarges the receptive field with
   dilated convolutions; SAM is a general-purpose promptable segmenter.
   Pick the model that matches the deployment scenario.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class FCN(nn.Module):
    """FCN-8s on a VGG-16 trunk (Long et al., 2015).

    The source used padding=1 on conv1 with crop offsets that never line
    up, so the forward pass could not run at any input size.  This follows
    the canonical FCN-8s recipe instead: pad the first convolution by 100
    so arbitrary input sizes survive the 32x downsampling plus the 7x7
    "fc6" convolution, fuse pool3/pool4 skip scores at matching crops
    (offsets 5 and 9), and crop the final 8x upsampled map back to the
    input size (offset 31).
    """

    def __init__(self, num_classes=21):
        super().__init__()
        # VGG-16 feature trunk; padding=100 on conv1 is the classic FCN trick.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=100)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, stride=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, stride=2)
        self.conv5 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2, stride=2)
        self.conv8 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv9 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(2, stride=2)
        self.conv11 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv12 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv13 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(2, stride=2)
        # "Fully connected" classifier layers recast as convolutions.
        self.fc6 = nn.Conv2d(512, 4096, kernel_size=7)
        self.fc7 = nn.Conv2d(4096, 4096, kernel_size=1)
        self.score_fr = nn.Conv2d(4096, num_classes, kernel_size=1)
        # Learned upsampling: 2x, 2x, then 8x back to input resolution.
        self.upscore2 = nn.ConvTranspose2d(num_classes, num_classes,
                                           kernel_size=4, stride=2, bias=False)
        self.upscore_pool4 = nn.ConvTranspose2d(num_classes, num_classes,
                                                kernel_size=4, stride=2, bias=False)
        self.upscore_pool3 = nn.ConvTranspose2d(num_classes, num_classes,
                                                kernel_size=16, stride=8, bias=False)
        # 1x1 scoring of the pool4/pool3 skip features.
        self.score_pool4 = nn.Conv2d(512, num_classes, kernel_size=1)
        self.score_pool3 = nn.Conv2d(256, num_classes, kernel_size=1)

    def forward(self, x):
        """Return per-pixel class scores of shape (N, num_classes, H, W)."""
        h = F.relu(self.conv1(x))
        h = F.relu(self.conv2(h))
        h = self.pool1(h)
        h = F.relu(self.conv3(h))
        h = F.relu(self.conv4(h))
        h = self.pool2(h)
        h = F.relu(self.conv5(h))
        h = F.relu(self.conv6(h))
        h = F.relu(self.conv7(h))
        pool3 = h = self.pool3(h)   # 1/8-resolution skip
        h = F.relu(self.conv8(h))
        h = F.relu(self.conv9(h))
        h = F.relu(self.conv10(h))
        pool4 = h = self.pool4(h)   # 1/16-resolution skip
        h = F.relu(self.conv11(h))
        h = F.relu(self.conv12(h))
        h = F.relu(self.conv13(h))
        h = self.pool5(h)
        h = F.relu(self.fc6(h))
        # The source called F.dropout(h) with no training flag, which keeps
        # dropout active even in eval mode; tie it to self.training instead.
        h = F.dropout(h, training=self.training)
        h = F.relu(self.fc7(h))
        h = F.dropout(h, training=self.training)
        h = self.score_fr(h)
        h = self.upscore2(h)
        up2 = h
        # Crop the (larger, padded) skip score map to the upsampled map, fuse.
        h = self.score_pool4(pool4)
        h = h[:, :, 5:5 + up2.size(2), 5:5 + up2.size(3)] + up2
        h = self.upscore_pool4(h)
        up4 = h
        h = self.score_pool3(pool3)
        h = h[:, :, 9:9 + up4.size(2), 9:9 + up4.size(3)] + up4
        h = self.upscore_pool3(h)
        # Remove the context introduced by conv1's padding=100.
        return h[:, :, 31:31 + x.size(2), 31:31 + x.size(3)]


class DoubleConv(nn.Module):
    """(conv 3x3 -> BN -> ReLU) twice; the basic U-Net building block."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.conv(x)


class Down(nn.Module):
    """Downscaling step: 2x max-pool followed by DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels),
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class Up(nn.Module):
    """Upscaling step: transposed conv, pad to match the skip, concat, DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, in_channels // 2,
                                     kernel_size=2, stride=2)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        # x1: decoder feature to upsample; x2: encoder skip connection.
        x1 = self.up(x1)
        # Pad x1 so odd input sizes still align with the skip tensor.
        diff_y = x2.size()[2] - x1.size()[2]
        diff_x = x2.size()[3] - x1.size()[3]
        x1 = F.pad(x1, [diff_x // 2, diff_x - diff_x // 2,
                        diff_y // 2, diff_y - diff_y // 2])
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    """Final 1x1 convolution mapping features to class logits."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)


class UNet(nn.Module):
    """Classic U-Net (Ronneberger et al., 2015) with a 64..1024 channel ladder."""

    def __init__(self, n_channels=3, n_classes=1):
        super().__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.down4 = Down(512, 1024)
        self.up1 = Up(1024, 512)
        self.up2 = Up(512, 256)
        self.up3 = Up(256, 128)
        self.up4 = Up(128, 64)
        self.outc = OutConv(64, n_classes)

    def forward(self, x):
        """Return logits of shape (N, n_classes, H, W)."""
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)
        return self.outc(x)


class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling: parallel dilated convs + image-level pooling."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        # padding == dilation keeps the 3x3 branches at the input resolution.
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               padding=6, dilation=6)
        self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               padding=12, dilation=12)
        self.conv4 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               padding=18, dilation=18)
        self.conv5 = nn.Conv2d(in_channels, out_channels, kernel_size=1)  # image-level branch
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.bn4 = nn.BatchNorm2d(out_channels)
        self.bn5 = nn.BatchNorm2d(out_channels)
        self.final_conv = nn.Conv2d(out_channels * 5, out_channels, kernel_size=1)

    def forward(self, x):
        x1 = F.relu(self.bn1(self.conv1(x)))
        x2 = F.relu(self.bn2(self.conv2(x)))
        x3 = F.relu(self.bn3(self.conv3(x)))
        x4 = F.relu(self.bn4(self.conv4(x)))
        # Global context branch: pool to 1x1, project, upsample back.
        x5 = F.adaptive_avg_pool2d(x, (1, 1))
        x5 = F.relu(self.bn5(self.conv5(x5)))
        x5 = F.interpolate(x5, size=x.size()[2:], mode="bilinear",
                           align_corners=True)
        out = torch.cat([x1, x2, x3, x4, x5], dim=1)
        return self.final_conv(out)


class DeepLab(nn.Module):
    """Minimal DeepLab-style model: toy backbone -> ASPP -> decoder -> upsample.

    The article paired a 4-layer toy backbone (64 output channels, 4x
    downsampling) with ASPP(2048, 256) and 8x upsampling, which cannot run:
    the channel count and scale factor are fixed here to match the backbone
    that is actually built.
    """

    def __init__(self, num_classes=21):
        super().__init__()
        self.backbone = self._build_backbone()
        # The toy backbone emits 64 channels (not ResNet's 2048).
        self.aspp = ASPP(64, 256)
        self.decoder = self._build_decoder(num_classes)

    def _build_backbone(self):
        # Stand-in backbone: stride-2 conv + stride-2 pool = 4x downsampling.
        return nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

    def _build_decoder(self, num_classes):
        # Small refinement head producing per-class score maps.
        return nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, num_classes, kernel_size=1),
        )

    def forward(self, x):
        """Return per-pixel class scores at the input resolution."""
        x = self.backbone(x)
        x = self.aspp(x)
        x = self.decoder(x)
        # Undo the backbone's 4x downsampling (the source used 8x, which
        # would not restore the input resolution).
        return F.interpolate(x, scale_factor=4, mode="bilinear",
                             align_corners=True)


def select_segmentation_model(task_type, constraints):
    """Pick a segmentation model for *task_type* under *constraints* (a dict)."""
    if task_type == "medical":
        return UNet(n_classes=1)  # U-Net is the medical-imaging default
    elif constraints.get("speed", False):
        # The source passed model_size="small", which DeepLab never accepted.
        return DeepLab()
    else:
        # NOTE(review): SAM is referenced but never defined in this file;
        # reaching this branch raises NameError until an implementation
        # (or import) is provided.
        return SAM()


class SegmentationFactory:
    """Build a segmentation model from a config dict keyed by 'type'."""

    @staticmethod  # the source wrote 'staticmethod' without the '@'
    def create(config):
        if config["type"] == "fcn":
            return FCN(num_classes=config["num_classes"])
        elif config["type"] == "unet":
            return UNet(n_channels=config["n_channels"],
                        n_classes=config["n_classes"])
        elif config["type"] == "deeplab":
            return DeepLab(num_classes=config["num_classes"])


class SegmentationTrainer:
    """Minimal training/evaluation loop wrapper around a segmentation model."""

    def __init__(self, model, optimizer, scheduler, loss_fn):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.loss_fn = loss_fn

    def train_step(self, images, masks):
        """Run one optimization step and return the scalar loss value."""
        self.optimizer.zero_grad()
        outputs = self.model(images)
        loss = self.loss_fn(outputs, masks)
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return loss.item()

    def evaluate(self, dataloader):
        """Return mean IoU over *dataloader*; leaves the model in eval mode."""
        self.model.eval()
        total_miou = 0.0
        with torch.no_grad():
            for images, masks in dataloader:
                outputs = self.model(images)
                preds = torch.argmax(outputs, dim=1)
                total_miou += self._compute_miou(preds, masks)
        return total_miou / len(dataloader)

    def _compute_miou(self, preds, masks):
        # Binary foreground IoU over {0,1} integer/bool tensors — not a true
        # per-class mean; TODO(review): confirm how masks are encoded.
        intersection = (preds & masks).sum()
        union = (preds | masks).sum()
        return intersection.item() / union.item() if union.item() > 0 else 0.0