MixtureTable.lua
local MixtureTable, parent = torch.class('nn.MixtureTable', 'nn.Module')

-- nn.MixtureTable takes {gaterInput, expertInputs} and mixes the expert
-- outputs (a table of tensors, or a single tensor along dimension `dim`)
-- using the gater's outputs as mixing weights.
function MixtureTable:__init(dim)
   parent.__init(self)
   self.dim = dim
   self.size = torch.LongStorage()
   self.batchSize = 0
   self.size2 = torch.LongStorage()
   self.backwardSetup = false
   self.gradInput = {}
end
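
-- Forward pass in a nutshell (derived from the addcmul/sum code below):
-- given gater output g (batchSize x n, or a size-n vector for a single
-- sample) and n expert outputs e_1..e_n, the module computes the mixture
--    output = sum_i g[.,i] (expanded to e_i's shape) .* e_i
-- i.e. each expert is weighted by its gater entry and accumulated.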
function MixtureTable:updateOutput(input)
   local gaterInput, expertInputs = table.unpack(input)

   -- buffers
   self._gaterView = self._gaterView or input[1].new()
   self._expert = self._expert or input[1].new()
   self._expertView = self._expertView or input[1].new()

   self.dimG = 2
   local batchSize = gaterInput:size(1)
   if gaterInput:dim() < 2 then
      self.dimG = 1
      self.dim = self.dim or 1
      batchSize = 1
   end
   self.dim = self.dim or 2

   if self.table or torch.type(expertInputs) == 'table' then
      -- expertInputs is a Table :
      self.table = true
      if gaterInput:size(self.dimG) ~= #expertInputs then
         error"Should be one gater output per expert"
      end
      local expertInput = expertInputs[1]
      self.size:resize(expertInput:dim()+1):fill(1)
      if self.dimG > 1 then
         self.size[1] = gaterInput:size(1)
      end
      self.size[self.dim] = gaterInput:size(self.dimG)
      self.output:resizeAs(expertInput)
      self.batchSize = batchSize
      self._gaterView:view(gaterInput, self.size)
      self.output:zero()
      -- multiply accumulate gater outputs by their commensurate expert
      for i,expertInput in ipairs(expertInputs) do
         local gate = self._gaterView:select(self.dim,i):expandAs(expertInput)
         self.output:addcmul(expertInput, gate)
      end
   else
      -- expertInputs is a Tensor :
      self.size:resize(expertInputs:dim()):fill(1)
      if self.dimG > 1 then
         self.size[1] = gaterInput:size(1)
      end
      self.size[self.dim] = gaterInput:size(self.dimG)
      self.output:resizeAs(expertInputs:select(self.dim, 1))
      self.batchSize = batchSize
      self._gaterView:view(gaterInput, self.size)
      self._expert:cmul(self._gaterView:expandAs(expertInputs), expertInputs)
      self.output:sum(self._expert, self.dim)
      self.output:resizeAs(expertInputs:select(self.dim, 1))
   end

   return self.output
end
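
-- Backward pass in a nutshell: from output = sum_i g_i .* e_i it follows that
--    dE/dg_i = sum over e_i's elements of (gradOutput .* e_i)
--    dE/de_i = g_i (expanded to e_i's shape) .* gradOutput
-- which is what updateGradInput computes, in both the table and tensor cases.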
function MixtureTable:updateGradInput(input, gradOutput)
   local gaterInput, expertInputs = table.unpack(input)
   nn.utils.recursiveResizeAs(self.gradInput, input)
   local gaterGradInput, expertGradInputs = table.unpack(self.gradInput)

   -- buffers
   self._sum = self._sum or input[1].new()
   self._expertView2 = self._expertView2 or input[1].new()
   self._expert2 = self._expert2 or input[1].new()

   if self.table then
      for i,expertInput in ipairs(expertInputs) do
         local expertGradInput = expertGradInputs[i] or expertInput:clone()
         expertGradInput:resizeAs(expertInput)
         expertGradInputs[i] = expertGradInput
      end
      gaterGradInput:resizeAs(gaterInput)

      -- clear invalid gradients left over from a larger expert table
      if #expertGradInputs > #expertInputs then
         for i=#expertInputs+1, #expertGradInputs do
            expertGradInputs[i] = nil
         end
      end

      -- like CMulTable, but with broadcasting
      for i,expertGradInput in ipairs(expertGradInputs) do
         -- gater updateGradInput
         self._expert:cmul(gradOutput, expertInputs[i])
         if self.dimG == 1 then
            self._expertView:view(self._expert, -1)
         else
            self._expertView:view(self._expert, gradOutput:size(1), -1)
         end
         self._sum:sum(self._expertView, self.dimG)
         if self.dimG == 1 then
            gaterGradInput[i] = self._sum:select(self.dimG,1)
         else
            gaterGradInput:select(self.dimG,i):copy(self._sum:select(self.dimG,1))
         end

         -- expert updateGradInput
         local gate = self._gaterView:select(self.dim,i):expandAs(expertGradInput)
         expertGradInput:cmul(gate, gradOutput)
      end
   else
      self.size2:resize(expertInputs:dim())
      self.size2:copy(expertInputs:size())
      self.size2[self.dim] = 1
      gaterGradInput:resizeAs(gaterInput)

      -- gater updateGradInput
      self._expertView:view(gradOutput, self.size2)
      local gradOutput = self._expertView:expandAs(expertInputs)
      self._expert:cmul(gradOutput, expertInputs)
      local expert = self._expert:transpose(self.dim, self.dimG)
      if not expert:isContiguous() then
         self._expert2:resizeAs(expert)
         self._expert2:copy(expert)
         expert = self._expert2
      end
      if self.dimG == 1 then
         self._expertView2:view(expert, gaterInput:size(1), -1)
      else
         self._expertView2:view(expert, gaterInput:size(1), gaterInput:size(2), -1)
      end
      gaterGradInput:sum(self._expertView2, self.dimG+1)
      gaterGradInput:resizeAs(gaterInput)

      -- expert updateGradInput
      expertGradInputs:cmul(self._gaterView:expandAs(expertInputs), gradOutput)
   end

   return self.gradInput
end

function MixtureTable:type(type, tensorCache)
   self._gaterView = nil
   self._expert = nil
   self._expertView = nil
   self._sum = nil
   self._expert2 = nil
   self._expertView2 = nil
   return parent.type(self, type, tensorCache)
end

function MixtureTable:clearState()
   nn.utils.clear(self, {
      '_gaterView',
      '_expert',
      '_expertView',
      '_sum',
      '_expert2',
      '_expertView2',
   })
   return parent.clearState(self)
end
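
--[[ Minimal usage sketch (illustrative only, in the style of the nn docs;
the layer sizes, the SoftMax gater, and the names `experts`, `gater`,
`trunk`, `moe` below are assumptions, not part of this file):

   require 'nn'

   local n = 3 -- number of experts

   -- experts: each maps a size-4 input to a size-5 output
   local experts = nn.ConcatTable()
   for i = 1, n do
      local expert = nn.Sequential()
      expert:add(nn.Linear(4, 5))
      expert:add(nn.Tanh())
      experts:add(expert)
   end

   -- gater: produces one mixing weight per expert
   local gater = nn.Sequential()
   gater:add(nn.Linear(4, n))
   gater:add(nn.SoftMax())

   -- trunk outputs {gaterOutput, {expertOutputs}}, the table MixtureTable expects
   local trunk = nn.ConcatTable()
   trunk:add(gater)
   trunk:add(experts)

   local moe = nn.Sequential()
   moe:add(trunk)
   moe:add(nn.MixtureTable())

   local output = moe:forward(torch.randn(2, 4)) -- 2x5 mixture of the experts
--]]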