{"id":383,"date":"2022-04-01T10:52:51","date_gmt":"2022-04-01T10:52:51","guid":{"rendered":"https:\/\/blog.liguanxin.cn\/?p=383"},"modified":"2022-04-01T10:52:51","modified_gmt":"2022-04-01T10:52:51","slug":"%e8%ae%ba%e6%96%87%e7%ac%94%e8%ae%b0-transformer-in-convolutional-neural-networks","status":"publish","type":"post","link":"https:\/\/blog.liguanxin.cn\/index.php\/2022\/04\/01\/%e8%ae%ba%e6%96%87%e7%ac%94%e8%ae%b0-transformer-in-convolutional-neural-networks\/","title":{"rendered":"\u8bba\u6587\u7b14\u8bb0\u2014\u2014Transformer in Convolutional Neural Networks"},"content":{"rendered":"<p><strong>\u521b\u65b0\u70b9\uff1a<br \/>\n\u2460\u5c42\u6b21\u591a\u5934\u81ea\u6ce8\u610f\u529b\u673a\u5236\uff0c\u51cf\u5c11\u8ba1\u7b97\/\u7a7a\u95f4\u590d\u6742\u5ea6<br \/>\n\u2461\u7ed3\u5408\u4e86transformer\u548cCNN\u7684\u4f18\u52bf<\/strong><\/p>\n<h1>\u603b\u4f53\u7ed3\u6784<\/h1>\n<p><img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401164546.png\" alt=\"\" \/><br \/>\nGAP\uff1a\u5168\u5c40\u5e73\u5747\u6c60\u5316<br \/>\nFC\uff1a\u5168\u8fde\u63a5\u5c42<br \/>\nDW Conv\uff1a\u6df1\u5ea6\u53ef\u5206\u79bb\u5377\u79ef<br \/>\nIRB\uff1a\u53cd\u5411\u6b8b\u5dee\u74f6\u9888\u5c42<br \/>\nTDB\uff1a\u53cc\u5206\u652f\u4e0b\u91c7\u6837\u6a21\u5757<br \/>\nLi\u4ee3\u8868H-MHSA\u548cIRB\u5c42\u91cd\u590dLi\u6b21\uff0cS\u4ee3\u8868\u5377\u79ef\u6b65\u957f\uff0cSiLU\u662f\u975e\u7ebf\u6027\u51fd\u6570\u3002<\/p>\n<h1>\u5c42\u6b21\u591a\u5934\u81ea\u6ce8\u610f\u529bHierarchical Multi-Head Self-Attention(H-MHSA)<\/h1>\n<p>\u76f8\u5f53\u4e8e\u628aH\u548cW\u7f29\u5c0fG\u500d\uff0c\u53ef\u4ee5\u7406\u89e3\u4e3a\u628a\u4e00\u5f20\u56fe\u7247\u62c6\u5206\u6210\u591a\u4e2aG*G\u5927\u5c0f\u7684\u56fe\u50cf\u5757\u53bb\u8ba1\u7b97\u81ea\u6ce8\u610f\u529b\u3002<br \/>\n<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401165322.png\" alt=\"\" 
\/><\/p>\n<ul>\n<li>\u5047\u8bbe\u8f93\u5165\u7279\u5f81\u56feX\u5c3a\u5bf8\u4e3a<span class=\"katex-eq\" data-katex-display=\"false\">H0*W0*C<\/span>\uff0c\u5219<span class=\"katex-eq\" data-katex-display=\"false\">N=H0*W0<\/span>\uff0c\u628a\u7279\u5f81\u56fe\u5212\u5206\u4e3aG0*G0\u7684\u5c0f\u5757\uff0c\u6b64\u65f6X&#8217;\u5c3a\u5bf8\u4e3a<span class=\"katex-eq\" data-katex-display=\"false\">(H0\/G0*W0\/G0)*(G0*G0)*C<\/span>\u3002<\/li>\n<li>\u5728\u8fd9\u4e2a\u5c3a\u5bf8\u4e0b\u505a\u81ea\u6ce8\u610f\u529b\uff0c\u4ee4<span class=\"katex-eq\" data-katex-display=\"false\">Q=X'W^q<\/span>\uff0c<span class=\"katex-eq\" data-katex-display=\"false\">K=X'W^k<\/span>\u548c<span class=\"katex-eq\" data-katex-display=\"false\">V=X'W^v<\/span>\uff0c\u5176\u4e2d<span class=\"katex-eq\" data-katex-display=\"false\">W^q,W^v,W^k<\/span>\u7684\u5c3a\u5bf8\u90fd\u662f<span class=\"katex-eq\" data-katex-display=\"false\">C*C<\/span><\/li>\n<li>\u81ea\u6ce8\u610f\u529b\u5b8c\u4e86\u4e4b\u540e\u5f97\u5230A0\u518d\u628a\u5c3a\u5bf8\u6062\u590d\u4e3a<span class=\"katex-eq\" data-katex-display=\"false\">H0*W0*C<\/span>\uff0c\u518d\u52a0\u4e0a\u6b8b\u5dee\u4f7fA0=X+A0<\/li>\n<li>\u7136\u540e\u901a\u8fc7\u8fd9\u4e2a\u516c\u5f0f\u83b7\u5f97\u7b2c\u4e8c\u4e2a\u521d\u59cb\u56fe<span class=\"katex-eq\" data-katex-display=\"false\">A0'=MaxPool_G1(A0)+AvgPool_G1(A0)<\/span>\uff0c\u7136\u540e\u91cd\u590d\u524d\u4e24\u6b65\u5212\u5206\u4e3aG1*G1\u7684\u5c0f\u5757\uff0c\u7b49\u3002<\/li>\n<li>\u6700\u540e\u5f97\u5230\u591a\u5934\u81ea\u6ce8\u610f\u529b\u7684\u516c\u5f0f\u4e3a\uff08\u5176\u4e2dupsample\u4e3a\u4e0a\u91c7\u6837\uff09<br \/>\n<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401172639.png\" alt=\"\" \/><\/li>\n<\/ul>\n<h1>CODE<\/h1>\n<p>\u6574\u4f53\u7ed3\u6784<\/p>\n<pre><code class=\"language-python\">class TransCNN(t.nn.Module):\n    def __init__(self,\n                 num_classes,\n                 in_channels = 3,\n                 g_sizes = [[8, 4, 2], [7, 4, 2], [3, 2, 2], [2, 2, 
2]],\n                 exp_ratios = [4, 4, 6, 5],\n                 repeats = [2, 2, 2, 2]):\n        super(TransCNN, self).__init__()\n        ...\n    def forward(self, x):\n\n        # 1. \u4e24\u5957Conv2d\u3001BatchNorm2d\u3001ReLU\u4e09\u8fde\n        x_conv = self.conv(x)\n        x = x_conv\n\n        # 2. \u57fa\u672cTransCNNBlock\u6a21\u5757\uff0c\u5206\u4e3a4\u4e2astage\n        for l in range(4):\n            transcnn_block = self.stages[l]\n            x = transcnn_block(x)\n\n        # 3. \u5168\u5c40\u5e73\u5747\u6c60\u5316\n        x_avg = self.avg(x)\n        x_avg = x_avg.squeeze()\n\n        # 4. \u7ebf\u6027\u5c42\u548c\u5206\u7c7b\u5668\n        out = self.cls(x_avg)\n\n        return out<\/code><\/pre>\n<p>TransCNNBlock\u6a21\u5757<\/p>\n<pre><code class=\"language-python\">class TransCNNBlock(t.nn.Module):\n    &quot;&quot;&quot;Define TransCNN Block&quot;&quot;&quot;\n\n    def __init__(self, in_channels, embed_dim, g_size, exp_ratio, kernel_size = 3):\n        super(TransCNNBlock, self).__init__()\n\n        # 1. \u5c42\u6b21\u591a\u5934\u81ea\u6ce8\u610f\u529b\u673a\u5236\n        self.hmhsa = HMHSA(in_channels, g_size = g_size, out_channels = embed_dim)\n\n        # 2. 
\u53cd\u5411\u6b8b\u5dee\u74f6\u9888\u5c42\n        self.irb = IRB(in_channels, exp_ratio = exp_ratio, kernel_size = kernel_size)\n\n    def forward(self, x):\n        x = self.hmhsa(x)\n        x = self.irb(x)\n        return x<\/code><\/pre>\n<p>\u5c42\u6b21\u591a\u5934\u81ea\u6ce8\u610f\u529b\u673a\u5236<br \/>\n<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401184902.png\" alt=\"\" \/><br \/>\n<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401184849.png\" alt=\"\" \/><\/p>\n<pre><code class=\"language-python\">class HMHSA(t.nn.Module):\n    &quot;&quot;&quot;Define HMHSA module&quot;&quot;&quot;\n\n    def __init__(self, in_channels, out_channels, g_size = [8, 4, 2]):\n        ...\n    def forward(self, x):\n        &quot;&quot;&quot;x has size [m, c, h, w]&quot;&quot;&quot;\n        # 1. step 0\n        x_0 = self.mg0(x)  # \u7b2c\u4e00\u6b65\uff0c\u628a\u56fe\u50cf\u5212\u5206\u6210G0*G0\u5927\u5c0f\u7684\u5757\n        a_0 = self.mhsa0(x_0)  # qkv\u4e58\u6cd5\n        a_0 = a_0.unsqueeze(dim = 3)  # \u6062\u590d\u5c3a\u5bf8[m, h, w, 1, c]\n        a_0 = t.matmul(a_0, self.W_p0).squeeze().permute(0, -1, 1, 2) + x # \u4e58\u4ee5\u6743\u91cd\u77e9\u9635\uff0c\u518d\u505a\u6b8b\u5dee\u8fde\u63a5\uff0c\u5982\u4e0a\u56fe # transformation # [m, c, h, w]\n        a_0_ = a_0\n        a_0 = a_0.permute(0, 2, 3, 1)\n        a_0 = self.mlp0(a_0)  # \u6700\u540e\u7684mlp\u518d\u505a\u6b8b\u5dee\u8fde\u63a5\uff0c\u5982\u4e0a\u56fe\n        a_0 = a_0.permute(0, -1, 1, 2) + a_0_  # \u6b8b\u5dee\u8fde\u63a5\n        x_0 = self.max_pool0(a_0) + self.avg_pool0(a_0)\n\n        # 2. 
step 1\uff08\u91cd\u590d\u4ee5\u4e0a\u6b65\u9aa4\uff09\n        x_1 = self.mg1(x_0)\n        a_1 = self.mhsa1(x_1)\n        a_1 = a_1.unsqueeze(dim = 3)  # [m, h, w, 1, c]\n        a_1 = t.matmul(a_1, self.W_p1).squeeze().permute(0, -1, 1, 2) + x_0  # transformation # [m, c, h, w]\n        a_1_ = a_1\n        a_1 = a_1.permute(0, 2, 3, 1)\n        a_1 = self.mlp1(a_1)\n        a_1 = a_1.permute(0, -1, 1, 2) + a_1_\n        x_1 = self.max_pool1(a_1) + self.avg_pool1(a_1)\n\n        # 3. step 2\uff08\u91cd\u590d\u4ee5\u4e0a\u6b65\u9aa4\uff09\n        x_2 = self.mg2(x_1)\n        a_2 = self.mhsa2(x_2)\n        a_2 = a_2.unsqueeze(dim = 3)  # [m, h, w, 1, c]\n        a_2 = t.matmul(a_2, self.W_p2).squeeze().permute(0, -1, 1, 2) + x_1  # transformation # [m, c, h, w]\n        a_2_ = a_2\n        a_2 = a_2.permute(0, 2, 3, 1)\n        a_2 = self.mlp0(a_2)\n        a_2 = a_2.permute(0, -1, 1, 2) + a_2_\n\n        # 4. Upsample\uff08\u4e0a\u91c7\u6837\u518d\u7d2f\u52a0\uff09\n        a_1 = self.upsample1(a_1)\n        a_2 = self.upsample2(a_2)\n        output = a_0 + a_1 + a_2\n\n        return output<\/code><\/pre>\n<p>IRB\u6a21\u5757<br \/>\n<img src=\"https:\/\/blog.liguanxin.cn\/wp-content\/uploads\/2022\/04\/\u5fae\u4fe1\u622a\u56fe_20220401185132.png\" alt=\"\" \/><\/p>\n<pre><code class=\"language-python\">class IRB(t.nn.Module):\n    &quot;&quot;&quot;Define IRB module&quot;&quot;&quot;\n\n    def __init__(self, in_channels, exp_ratio, kernel_size = 3):\n        &quot;&quot;&quot;\n        Args :\n            --in_channel: input channels\n            --exp_ratio: expansion ratio\n            --kernel_size: default is 3\n        &quot;&quot;&quot;\n        super(IRB, self).__init__()\n\n        hid_channels = int(exp_ratio * in_channels)\n        self.layers = t.nn.Sequential(\n            t.nn.Conv2d(in_channels = in_channels, out_channels = hid_channels, kernel_size = 1),\n            t.nn.BatchNorm2d(hid_channels),\n            t.nn.SiLU(),\n\n            
t.nn.Conv2d(in_channels = hid_channels, out_channels = hid_channels, kernel_size = kernel_size, padding = kernel_size \/\/ 2, groups = hid_channels),\n            t.nn.BatchNorm2d(hid_channels),\n            t.nn.SiLU(),\n\n            t.nn.Conv2d(in_channels = hid_channels, out_channels = in_channels, kernel_size = 1),\n            t.nn.BatchNorm2d(in_channels)\n        )\n\n    def forward(self, x):\n        x_ = self.layers(x)\n\n        return x + x_<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u521b\u65b0\u70b9\uff1a \u2460\u5c42\u6b21\u591a\u5934\u81ea\u6ce8\u610f\u529b\u673a\u5236\uff0c\u51cf\u5c11\u8ba1\u7b97\/\u7a7a\u95f4\u590d\u6742\u5ea6 \u2461\u7ed3\u5408\u4e86transformer\u548cCNN\u7684\u4f18\u52bf \u603b\u4f53\u7ed3\u6784 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[6],"tags":[14,13,17,11],"_links":{"self":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts\/383"}],"collection":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/comments?post=383"}],"version-history":[{"count":0,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/posts\/383\/revisions"}],"wp:attachment":[{"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/media?parent=383"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/categories?post=383"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.liguanxin.cn\/index.php\/wp-json\/wp\/v2\/tags?post=383"}],"curies":[{"name":"wp","href":"https:\/\/a
pi.w.org\/{rel}","templated":true}]}}