#!/usr/bin/env python3
# this file is a "ramp" for people new to tinygrad to think about how to approach it
# it is runnable and editable.
# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
# the commands below pip install tinygrad master for the system
# the -e flag makes the install editable, so edits in the tinygrad folder take effect system-wide
# tinygrad is pure Python, so you are encouraged to do this
# git pull in the tinygrad directory will also get you the latest
"""
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
python3 -m pip install -e .
"""
# %% ********
print("******* PART 1 *******")
# we start with a Device.
# a Device is where Tensors are stored and compute is run
# tinygrad autodetects the best device on your system and makes it the DEFAULT
from tinygrad import Device
print(Device.DEFAULT) # on Mac, you can see this prints METAL
# now, let's create a Tensor
from tinygrad import Tensor, dtypes
t = Tensor([1,2,3,4])
# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
assert t.device == Device.DEFAULT
assert t.dtype == dtypes.int
assert t.shape == (4,)
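# a small aside: you can also pick the device explicitly with the device= argument
# this sketch assumes the CPU backend is available (the same one the CPU=1 env var selects)
t_cpu = Tensor([1,2,3,4], device="CPU")
assert t_cpu.device == "CPU"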
# unlike in torch, printing t doesn't show its contents
# this is because tinygrad is lazy
# this Tensor has not been computed yet
print(t)
# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
# the ".uop" property on Tensor contains the specification of how to compute it
print(t.uop)
"""
UOp(Ops.COPY, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# as you can see, it's specifying a copy from PYTHON device
# which is where the [1,2,3,4] array lives
# UOps are the specification language in tinygrad
# they are immutable and form a DAG
# they have an op (from the Ops enum), a dtype, a tuple of srcs (parents), and an arg
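# we can read those fields directly; at this point the op is still the COPY shown above
print(t.uop.op, t.uop.dtype, len(t.uop.src), t.uop.arg)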
t.realize()
# if we want to "realize" a Tensor, we can do so with the "realize" method
# now when we look at the uop, it's changed
print(t.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
# if you run this script with DEBUG=2 in the environment, you can see the copy happen
# *** METAL 1 copy 16, METAL <- PYTHON ...
# now let's do some compute
# we look at the uop to see the specification of the compute
t_times_2 = t * 2
print(t_times_2.uop)
"""
UOp(Ops.MUL, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=2, src=(
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x2,)),)),)),)),))
"""
# the BUFFER from above is being multiplied by a CONST 2
# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
# we can check the result with
assert t_times_2.tolist() == [2, 4, 6, 8]
# UOps are both immutable and globally unique
# if I multiply the Tensor by 4 twice, the resulting Tensors will have the same uop specification
t_times_4_try_1 = t * 4
t_times_4_try_2 = t * 4
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# the specification isn't just the same, it's the exact same Python object
assert t_times_4_try_1 is not t_times_4_try_2
# the Tensor is a different Python object
# if we realize `t_times_4_try_1` ...
t_times_4_try_1.realize()
print(t_times_4_try_2.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# ... `t_times_4_try_2` also becomes the same BUFFER
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# so this print doesn't require any computation, just a copy back to the CPU so we can print it
print("** only the copy start")
print(t_times_4_try_2.tolist()) # [4, 8, 12, 16]
print("** only the copy end")
# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
# tinygrad has an auto differentiation engine that operates according to these same principles
# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
t_float = Tensor([3.0])
t_log = t_float.log()
t_log_grad, = t_log.sum().gradient(t_float)
# due to how log is implemented, this gradient contains a lot of UOps
print(t_log_grad.uop)
# ...not shown here...
# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
"""
void E_(float* restrict data0, float* restrict data1) {
float val0 = *(data1+0);
*(data0+0) = (0.6931471805599453f*(1/(val0*0.6931471805599453f)));
}
"""
# the derivative is close to 1/3
assert abs(t_log_grad.item() - 1/3) < 1e-6
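# one more check with the same gradient API: the derivative of x*x is 2*x, so at x=2 it should be 4
x = Tensor([2.0])
x_grad, = (x*x).sum().gradient(x)
assert abs(x_grad.item() - 4.0) < 1e-6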
# %% ********
print("******* PART 2 *******")
# we redefine the same t here so this cell can run on its own
from tinygrad import Tensor
t = Tensor([1,2,3,4])
# what's above gives you enough of an understanding to go use tinygrad as a library
# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
# NOTE: the APIs here are subject to change
t_plus_3_plus_4 = t + 3 + 4
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=3, src=(
x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x3,)),)),)),)),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=4, src=(
x7,)),)),)),))
"""
# you can see it's adding both 3 and 4
# but by the time we are actually running the code, it's adding 7
# `kernelize` will simplify and group the operations in the graph into kernels
t_plus_3_plus_4.kernelize()
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
x0,
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2,)),)),))
"""
# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
# src[1] is the GPU Kernel that's going to be run
# we can get the ast of the Kernel as follows
kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
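# printing the raw ast here gives a point of comparison with the rewritten version below
print(kernel_ast)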
# almost everything in tinygrad works by rewriting UOps
# the codegen rewrites the ast to a simplified form ready for "rendering"
from tinygrad.codegen import full_rewrite_to_sink
rewritten_ast = full_rewrite_to_sink(kernel_ast)
print(rewritten_ast)
"""
UOp(Ops.SINK, dtypes.void, arg=None, src=(
UOp(Ops.STORE, dtypes.void, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.int, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
x3,)),)),
UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
"""
# you can see at this point we are adding 7, not 3 and 4
# with DEBUG=4, we can see the code.
# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
t_plus_3_plus_4.realize()
"""
void E_4n2(int* restrict data0, int* restrict data1) {
int val0 = *(data1+0);
int val1 = *(data1+1);
int val2 = *(data1+2);
int val3 = *(data1+3);
*(data0+0) = (val0+7);
*(data0+1) = (val1+7);
*(data0+2) = (val2+7);
*(data0+3) = (val3+7);
}
"""
# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
# if you run with NOOPT=1 ...
"""
void E_4n2(int* restrict data0, int* restrict data1) {
for (int ridx0 = 0; ridx0 < 4; ridx0++) {
int val0 = *(data1+ridx0);
*(data0+ridx0) = (val0+7);
}
}
"""
# ... you get this unoptimized code with a loop, and the 4 is blue (for global); the color code is in kernel.py
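# and we can confirm the values: [1,2,3,4] plus 7 elementwise
assert t_plus_3_plus_4.tolist() == [8, 9, 10, 11]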
# %% ********
print("******* PART 3 *******")
# now we go even lower, to better understand UOps and how the graph rewrite engine works.
# it's much simpler than what's in LLVM or MLIR
from tinygrad import dtypes
from tinygrad.uop.ops import UOp, Ops
# first, we'll construct some const UOps
a = UOp(Ops.CONST, dtypes.int, arg=2)
b = UOp(Ops.CONST, dtypes.int, arg=2)
# if you have been paying attention, you should know these are the same Python object
assert a is b
# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
a_plus_b = a + b
print(a_plus_b)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
x0,))
"""
# we could actually render this 2+2 into a language like c and run it
# or, we can use tinygrad's graph rewrite engine to "constant fold"
from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
# a `PatternMatcher` is a list of tuples. for each element in the list:
# [0] is the pattern to match, and [1] is the function to run.
# this function can return either a UOp to replace the pattern with, or None to not replace
simple_pm = PatternMatcher([
(UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
])
# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
# to actually apply the pattern to a_plus_b, we use graph_rewrite
a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
print(a_plus_b_simplified)
"""
UOp(Ops.CONST, dtypes.int, arg=4, src=())
"""
# 2+2 is, in fact, 4
# we can also use syntactic sugar to write the pattern more nicely
simpler_pm = PatternMatcher([
(UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
])
assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
# note again the use of is, UOps are immutable and globally unique
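# one more sketch of a pattern (our own example here, not one of tinygrad's built-in simplifications):
# fold x + 0 into x. we build the ADD by hand so no operator sugar can simplify it early
zero = UOp(Ops.CONST, dtypes.int, arg=0)
a_plus_zero = UOp(Ops.ADD, dtypes.int, src=(a, zero))
fold_add_zero = PatternMatcher([
  (UPat.var("x") + UPat(Ops.CONST, arg=0), lambda x: x),
])
assert graph_rewrite(a_plus_zero, fold_add_zero) is a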
# %% ********
# that brings you to an understanding of the most core concepts in tinygrad
# you can run this with VIZ=1 to use the web-based graph rewrite explorer
# hopefully now you understand it. the nodes in the graph are just UOps