{"id":137,"date":"2022-03-10T05:46:13","date_gmt":"2022-03-10T05:46:13","guid":{"rendered":"https:\/\/blog.liguanxin.cn\/?p=137"},"modified":"2022-03-16T05:29:45","modified_gmt":"2022-03-16T05:29:45","slug":"%e8%ae%ba%e6%96%87%e7%ac%94%e8%ae%b0%ef%bc%9aconformer-local-features-coupling-global-representations-for-visual-recognition","status":"publish","type":"post","link":"https:\/\/blog.liguanxin.cn\/index.php\/2022\/03\/10\/%e8%ae%ba%e6%96%87%e7%ac%94%e8%ae%b0%ef%bc%9aconformer-local-features-coupling-global-representations-for-visual-recognition\/","title":{"rendered":"\u8bba\u6587\u7b14\u8bb0\u2014\u2014Conformer: Local Features Coupling Global Representations for Visual Recognition"},"content":{"rendered":"<p><img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/03\/\u5fae\u4fe1\u622a\u56fe_20220310130135.png\" alt=\"\" \/><br \/>\n<strong>\u521b\u65b0\u70b9\uff1a\u540c\u65f6\u5229\u7528CNN\u7684\u6355\u83b7\u5c40\u90e8\u7279\u5f81\u7684\u4f18\u70b9\u548cTransformer\u6355\u83b7\u957f\u8ddd\u79bb\u7279\u5f81\u7684\u4f18\u70b9\u3002<\/strong><\/p>\n<p>\u4e0a\u56fe\u4e2d\u7684(c)\u8868\u793a\u6574\u4e2a\u7f51\u7edc\u7ed3\u6784\u7684\u5e76\u53d1\u6784\u578b\u3002<br \/>\n(b)\u8868\u793a\uff0c\u4e24\u4e2a\u5206\u652f\u7684\u521d\u59cb\u7279\u5f81\u662f\u76f8\u540c\u7684\uff0c\u6cbf\u7740\u4e24\u4e2a\u5206\u652f\u4ee5\u4ea4\u4e92\u7684\u65b9\u5f0f\u9010\u6b65\u878d\u5408\u7279\u5f81\u3002\u6700\u540e\uff0cCNN\u5206\u652f\u5408\u5e76\u8f93\u5165\u7ed9\u4e00\u4e2a\u5206\u7c7b\u5668\uff0cTransformer\u5206\u652f\u7ed9\u53e6\u4e00\u4e2a\u5206\u7c7b\u5668\u3002<\/p>\n<p><strong>CNN\u5206\u652f\u91c7\u7528\u7279\u5f81\u91d1\u5b57\u5854<\/strong>\uff08\u6df1\u5ea6\u589e\u52a0-\u5206\u8fa8\u7387\u964d\u4f4e-\u901a\u9053\u589e\u52a0\uff09<br 
\/>\n<strong>Transformer\u5757\u628a\u4e0d\u91cd\u53e0\u7684\u56fe\u50cf\u5757\u6295\u5f71\u5230\u5411\u91cf\u7a7a\u95f4<\/strong>\uff08\u5bfc\u81f4\u5c40\u90e8\u7279\u5f81\u6d88\u5931\uff09\u56fe\u50cf\u5757\u6570\u91cf\u4e3a14*14<br \/>\n\u56e0\u4e3aCNN\u5df2\u7ecf\u5305\u542b\u4e86\u5c40\u90e8\u7279\u5f81\u4fe1\u606f\u548c\u4f4d\u7f6e\u4fe1\u606f\uff0c\u6240\u4ee5Transformer\u4e0d\u518d\u9700\u8981\u4f4d\u7f6e\u7f16\u7801\u3002<\/p>\n<h3>\u7279\u5f81\u8026\u5408\u5355\u5143<\/h3>\n<p>CNN\u7684\u7279\u5f81\u7ef4\u5ea6\u662fC*H*W<br \/>\nTransformer\u7684\u7279\u5f81\u7ef4\u5ea6\u4e3a(K+1)*E\u5176\u4e2dK\u662f\u56fe\u50cf\u5757\u7684\u6570\u91cf\uff0c1\u662f\u5206\u7c7btoken\uff0cE\u662fembedding\u7684\u7ef4\u5ea6\u3002<\/p>\n<ul>\n<li>\u8981\u8ba9CNN\u4f20\u5230Transformer\uff0c\u9996\u5148\u8981\u901a\u8fc71*1\u5377\u79ef\u628achannel\u53d8\u6210E\u3002\u7136\u540e\u5982\u4e0a\u56fe(a)\u5229\u7528\u5e73\u5747\u6c60\u5316\u548creshape\u53d8\u6210Transformer\u7684\u7279\u5f81\u7ef4\u5ea6\uff0c\u4e0eTransformer\u7684\u7279\u5f81\u76f8\u52a0\u3002<\/li>\n<li>\u4eceTransformer\u5230CNN\u5219\u91c7\u7528\u7c7b\u4f3c\u7684\u64cd\u4f5c\uff0c\u5982\u56fe(b)\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u4e00\u8fc7\u7a0b\u91cd\u590d\u6267\u884c\u4e86N\u6b21\uff0c\u5982\u56fe(c)<\/p>\n<p>\u8fd9\u4e00\u7ed3\u6784\u53ef\u4ee5\u62bd\u8c61\u4e3aCNN\u548cTransformer\u7684\u6b8b\u5dee\u7ed3\u6784\uff08\u6216\u8005\u8bf4\u96c6\u6210\u5b66\u4e60\uff09\uff0c\u65e2\u80fd\u770b\u4f5c\u662fCNN\u8fde\u63a5\u4e5f\u80fd\u770b\u4f5c\u662fTransformer\u7684\u8fde\u63a5\u3002<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/03\/\u5fae\u4fe1\u622a\u56fe_20220310134132.png\" alt=\"\" \/><\/p>\n<h3>\u4ee3\u7801\u89e3\u6790<\/h3>\n<p>ConvTransformer\u5c42<\/p>\n<pre><code class=\"language-python\">    def forward(self, x, x_t):\n        x, x2 = self.cnn_block(x)\n\n        _, _, H, W = x2.shape\n\n        # CNN\u7684\u7279\u5f81\u8f6c\u6362\u6210Transformer\u7684shape\n        x_st = 
self.squeeze_block(x2, x_t)\n        # \u539f\u672c\u7684transformer\u7279\u5f81\u52a0\u4e0aCNN\u8f6c\u7684\u7279\u5f81\n        x_t = self.trans_block(x_st + x_t)\n\n        if self.num_med_block &gt; 0:\n            for m in self.med_block:\n                x = m(x)\n\n        # transformer\u7279\u5f81\u8f6c\u4e3aCNN\u7279\u5f81\uff0c\u5e76\u4e14\u4e0e\u539f\u6765\u7684\u878d\u5408\n        x_t_r = self.expand_block(x_t, H \/\/ self.dw_stride, W \/\/ self.dw_stride)\n        x = self.fusion_block(x, x_t_r, return_x_2=False)\n\n        return x, x_t<\/code><\/pre>\n<p>\u603b\u4f53Conformer\u7ed3\u6784<\/p>\n<pre><code class=\"language-python\">class Conformer(nn.Module):\n\n    def __init__(self, patch_size=16, in_chans=3, num_classes=1000, base_channel=64, channel_ratio=4, num_med_block=0,\n                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None,\n                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.):\n\n        # Transformer\n        super().__init__()\n        self.num_classes = num_classes  #\u5206\u7c7b\u7684\u6570\u91cf\n        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models\n        assert depth % 3 == 0\n\n        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))    # class token\u521d\u59cb\u5316\u4e3a1*1*embed_dim\u7684\u7ef4\u5ea6\uff0c\u7528\u4e8e\u6700\u540e\u7684\u5206\u7c7b\n        self.trans_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule\n\n        # Classifier head\n        self.trans_norm = nn.LayerNorm(embed_dim)\n        self.trans_cls_head = nn.Linear(embed_dim, num_classes) if num_classes &gt; 0 else nn.Identity()\n        self.pooling = nn.AdaptiveAvgPool2d(1)\n        self.conv_cls_head = nn.Linear(int(256 * channel_ratio), num_classes)\n\n        # Stem stage: get the feature maps by conv block (copied from resnet.py)\u4e5f\u5c31\u662f\u83b7\u5f97CNN\u7684\u7279\u5f81\n   
     self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False)  # 1 \/ 2 [112, 112]\n        self.bn1 = nn.BatchNorm2d(64)\n        self.act1 = nn.ReLU(inplace=True)\n        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  # 1 \/ 4 [56, 56]\n\n        # 1 stage\n        stage_1_channel = int(base_channel * channel_ratio)\n        trans_dw_stride = patch_size \/\/ 4\n        self.conv_1 = ConvBlock(inplanes=64, outplanes=stage_1_channel, res_conv=True, stride=1)  #  \u5bf9\u5e94\u56fe\u4e2d\u76841*1 Conv-BN,3*3 Conv-BN,1*1 Conv-BN\n        self.trans_patch_conv = nn.Conv2d(64, embed_dim, kernel_size=trans_dw_stride, stride=trans_dw_stride, padding=0)  # \u628achannel\u4ece64\u6295\u5f71\u5230embed_dim\u7ef4\uff0c\u540c\u65f6\u75284*4\u5377\u79ef\u63d0\u53d6\u7279\u5f81\u5f97\u523014*14\u4e2a\u7279\u5f81\u5757\uff08\u539f\u5c3a\u5bf8\u4e3aN, 64, 56, 56\uff09\n        self.trans_1 = Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,\n                             qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=self.trans_dpr[0],\n                             )  # transformer\u6a21\u5757\uff0c\u5305\u62ec\u591a\u5934\u81ea\u6ce8\u610f\u529b\u548cmlp\n\n        # 2~4 stage\n        init_stage = 2\n        fin_stage = depth \/\/ 3 + 1\n        # 3\u4e2aConvTransformer\u5c42\n        for i in range(init_stage, fin_stage):\n            self.add_module(&#039;conv_trans_&#039; + str(i),\n                    ConvTransBlock(\n                        stage_1_channel, stage_1_channel, False, 1, dw_stride=trans_dw_stride, embed_dim=embed_dim,\n                        num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,\n                        drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=self.trans_dpr[i-1],\n                        num_med_block=num_med_block\n                    )\n            )\n\n        # 
\u91d1\u5b57\u5854\u7ed3\u6784channel\u9010\u6e10\u589e\u52a0\n        stage_2_channel = int(base_channel * channel_ratio * 2)\n        # 5~8 stage\n        init_stage = fin_stage # 5\n        fin_stage = fin_stage + depth \/\/ 3 # 9\n\n        for i in range(init_stage, fin_stage):\n            s = 2 if i == init_stage else 1\n            in_channel = stage_1_channel if i == init_stage else stage_2_channel\n            res_conv = True if i == init_stage else False\n            # CNN\u7684\u5206\u8fa8\u7387\u9010\u6e10\u964d\u4f4e\uff0cchannel\u9010\u6e10\u589e\u52a0\uff0cTransformer\u4e0d\u53d8\n            self.add_module(&#039;conv_trans_&#039; + str(i),\n                    ConvTransBlock(\n                        in_channel, stage_2_channel, res_conv, s, dw_stride=trans_dw_stride \/\/ 2, embed_dim=embed_dim,\n                        num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,\n                        drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=self.trans_dpr[i-1],\n                        num_med_block=num_med_block\n                    )\n            )\n        # \u53d8\u5316\u540c\u4e0a\n        stage_3_channel = int(base_channel * channel_ratio * 2 * 2)\n        # 9~12 stage\n        init_stage = fin_stage  # 9\n        fin_stage = fin_stage + depth \/\/ 3  # 13\n        for i in range(init_stage, fin_stage):\n            s = 2 if i == init_stage else 1\n            in_channel = stage_2_channel if i == init_stage else stage_3_channel\n            res_conv = True if i == init_stage else False\n            last_fusion = True if i == depth else False\n            self.add_module(&#039;conv_trans_&#039; + str(i),\n                    ConvTransBlock(\n                        in_channel, stage_3_channel, res_conv, s, dw_stride=trans_dw_stride \/\/ 4, embed_dim=embed_dim,\n                        num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,\n                        
drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=self.trans_dpr[i-1],\n                        num_med_block=num_med_block, last_fusion=last_fusion\n                    )\n            )\n        self.fin_stage = fin_stage\n\n        trunc_normal_(self.cls_token, std=.02)\n\n        self.apply(self._init_weights)\n...\n...\n...\n    def forward(self, x):\n        B = x.shape[0]\n        cls_tokens = self.cls_token.expand(B, -1, -1)\n\n        # pdb.set_trace()\n        # stem stage [N, 3, 224, 224] -&gt; [N, 64, 56, 56]\n        x_base = self.maxpool(self.act1(self.bn1(self.conv1(x))))\n\n        # 1 stage\n        x = self.conv_1(x_base, return_x_2=False)\n\n        x_t = self.trans_patch_conv(x_base).flatten(2).transpose(1, 2)\n        x_t = torch.cat([cls_tokens, x_t], dim=1)\n        x_t = self.trans_1(x_t)\n\n        # 2 ~ final \n        for i in range(2, self.fin_stage):\n            x, x_t = eval(&#039;self.conv_trans_&#039; + str(i))(x, x_t)\n\n        # conv classification\n        x_p = self.pooling(x).flatten(1)\n        conv_cls = self.conv_cls_head(x_p)\n\n        # trans classification\n        x_t = self.trans_norm(x_t)\n        tran_cls = self.trans_cls_head(x_t[:, 0])\n\n        return [conv_cls, tran_cls]\n<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u521b\u65b0\u70b9\uff1a\u540c\u65f6\u5229\u7528CNN\u7684\u6355\u83b7\u5c40\u90e8\u7279\u5f81\u7684\u4f18\u70b9\u548cTransformer\u6355\u83b7\u957f\u8ddd\u79bb\u7279\u5f81\u7684\u4f18\u70b9\u3002 \u4e0a\u56fe\u4e2d\u7684(c)\u8868\u793a\u6574 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[6],"tags":[14,13,16,11,12,15],"_links":{"self":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts\/137"}],"collection":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/comments?post=137"}],"version-history":[{"count":4,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts\/137\/revisions"}],"predecessor-version":[{"id":144,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts\/137\/revisions\/144"}],"wp:attachment":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/media?parent=137"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/categories?post=137"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/tags?post=137"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}