Tensor Tiling Library
TTL_simplex_scheme.h
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2023 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the License);
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used both to
 * export then import, and two buffers are used by compute for reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel to computation, but serially with each
 * other. Using the same buffer for both import and export allows us to overlap
 * exporting from and importing to the same buffer.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **WaitExport**    |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **WaitImport**    |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on
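
// clang-format off
/*
 * A worked instance of the schedule above for NumOfTiles = 4 (a sketch, not
 * normative): iteration -1 imports tile 0 (the prolog); iterations 0..3
 * compute tiles 0..3 while tiles 1..3 are imported and tiles 0..2 exported;
 * iteration 4 exports tile 3 and iteration 5 waits for that export (the two
 * epilogs). In total, 4 tiles take 1 + 4 + 2 = 7 iterations.
 */
// clang-format on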

// This file presumes that the following have been pre-included
// (this is not done here for path reasons):
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local void *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_void_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_void_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;                // The tile to be exported on the next step.
    TTL_int_void_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_void_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * The simplex buffering scheme rotates three local buffers between the
 * import/export, compute-read, and compute-write roles; see the file-level
 * description above for the full pipeline.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_const_void_tensor_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 *
 * This can be optimized and standardized using the TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_void_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local void *int_base1, __local void *int_base2, __local void *int_base3, TTL_ext_void_tensor_t ext_tensor_in,
    TTL_ext_void_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_void_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}
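
/* The single TTL_step_buffering() call in TTL_start_simplex_buffering() is the
 * prolog (iteration -1 in the schedule above): it starts the import of
 * first_tile, paired with an empty export, so that data is already in flight
 * when the first compute iteration begins. */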

static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and the import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_void_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_void_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_void_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_void_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0]));  // Write to.
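    // The modulo arithmetic wraps the index over the three entries of
    // int_base, so the write-to buffer cycles 0 -> 1 -> 2 -> 0 as the
    // pipeline advances.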

    // Retrieve the buffer imported previously, to read from now.
    const TTL_int_void_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // Can write to the out buffer according to the size of curr_tile, rather than the size
    // recently exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_void_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the last two tiles to prevent commonly repeated get_tile()'s.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}
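
/* The returned io tensors pair the sub-tensor imported on the previous step
 * (for compute to read) with a sub-tensor over the next buffer in the rotation
 * (for compute to write; it will be exported on the following step). */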

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffering) {
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
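
/**
 * A minimal usage sketch of the start/step/finish sequence (for the uchar
 * variant defined later in this file). It assumes the tiler helpers from
 * TTL_core.h (TTL_create_shape, TTL_create_tiler, TTL_get_tile,
 * TTL_number_of_tiles), the io-tensor field names imported_to and
 * to_export_from, and a user-supplied compute() function; LOCAL_TILE_SIZE and
 * the kernel signature are illustrative assumptions only. See
 * TTL_simplex_buffering.cl for the authoritative example.
 *
 * @code
 * __kernel void simplex_example(__global uchar *restrict ext_base_in,
 *                               __global uchar *restrict ext_base_out,
 *                               int width, int height, int tile_width, int tile_height) {
 *     __local uchar l_buff1[LOCAL_TILE_SIZE];  // LOCAL_TILE_SIZE: assumed user-defined macro
 *     __local uchar l_buff2[LOCAL_TILE_SIZE];
 *     __local uchar l_buff3[LOCAL_TILE_SIZE];
 *
 *     const TTL_shape_t tensor_shape = TTL_create_shape(width, height);
 *     const TTL_tiler_t tiler = TTL_create_tiler(tensor_shape, TTL_create_shape(tile_width, tile_height));
 *     const TTL_layout_t ext_layout = TTL_create_layout(width);
 *     const TTL_ext_uchar_tensor_t ext_tensor_in = TTL_create_ext_tensor(ext_base_in, tensor_shape, ext_layout);
 *     const TTL_ext_uchar_tensor_t ext_tensor_out = TTL_create_ext_tensor(ext_base_out, tensor_shape, ext_layout);
 *
 *     TTL_event_t event_in = TTL_get_event();
 *     TTL_event_t event_out = TTL_get_event();
 *     TTL_simplex_const_uchar_tensor_buffering_t scheme =
 *         TTL_start_simplex_buffering(l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *                                     &event_in, &event_out, TTL_get_tile(0, tiler));
 *
 *     for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *         // Import tile i+1, export the result of tile i-1, compute tile i.
 *         TTL_io_uchar_tensor_t io =
 *             TTL_step_buffering(&scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 *         compute(io.imported_to, io.to_export_from);
 *     }
 *
 *     TTL_finish_buffering(&scheme);  // the two epilog steps: drain the remaining exports
 * }
 * @endcode
 */
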
/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local char *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_char_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_char_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;                // The tile to be exported on the next step.
    TTL_int_char_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_char_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

static inline TTL_simplex_const_char_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local char *int_base1, __local char *int_base2, __local char *int_base3, TTL_ext_char_tensor_t ext_tensor_in,
    TTL_ext_char_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_char_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and the import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_char_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_char_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_char_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_char_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0]));  // Write to.

    // Retrieve the buffer imported previously, to read from now.
    const TTL_int_char_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // Can write to the out buffer according to the size of curr_tile, rather than the size
    // recently exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_char_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the last two tiles to prevent commonly repeated get_tile()'s.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffering) {
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local uchar *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_uchar_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_uchar_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;                 // The tile to be exported on the next step.
    TTL_int_uchar_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_uchar_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

static inline TTL_simplex_const_uchar_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local uchar *int_base1, __local uchar *int_base2, __local uchar *int_base3, TTL_ext_uchar_tensor_t ext_tensor_in,
    TTL_ext_uchar_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_uchar_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and the import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_uchar_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_uchar_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_uchar_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_uchar_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                   simplex_buffer->next_exported_tile.shape,
                                                                   simplex_buffer->common.ext_tensor_out.layout,
                                                                   simplex_buffer->next_exported_tile.offset,
                                                                   simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0]));  // Write to.

    // Retrieve the buffer imported previously, to read from now.
    const TTL_int_uchar_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // Can write to the out buffer according to the size of curr_tile, rather than the size
    // recently exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_uchar_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the last two tiles to prevent commonly repeated get_tile()'s.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffering) {
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local int *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_int_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_int_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;               // The tile to be exported on the next step.
    TTL_int_int_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_int_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

static inline TTL_simplex_const_int_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local int *int_base1, __local int *int_base2, __local int *int_base3, TTL_ext_int_tensor_t ext_tensor_in,
    TTL_ext_int_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_int_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and the import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_int_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_int_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_int_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_int_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                 simplex_buffer->next_exported_tile.shape,
                                                                 simplex_buffer->common.ext_tensor_out.layout,
                                                                 simplex_buffer->next_exported_tile.offset,
                                                                 simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0]));  // Write to.

    // Retrieve the buffer imported previously, to read from now.
    const TTL_int_int_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // Can write to the out buffer according to the size of curr_tile, rather than the size
    // recently exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_int_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the last two tiles to prevent commonly repeated get_tile()'s.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffering) {
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local uint *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_uint_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_uint_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;                // The tile to be exported on the next step.
    TTL_int_uint_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_uint_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

static inline TTL_simplex_const_uint_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local uint *int_base1, __local uint *int_base2, __local uint *int_base3, TTL_ext_uint_tensor_t ext_tensor_in,
    TTL_ext_uint_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_uint_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and the import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_uint_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_uint_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_uint_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_uint_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0]));  // Write to.

    // Retrieve the buffer imported previously, to read from now.
    const TTL_int_uint_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // Can write to the out buffer according to the size of curr_tile, rather than the size
    // recently exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_uint_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the last two tiles to prevent commonly repeated get_tile()'s.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffering) {
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local short *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_short_tensor_t ext_tensor_in; /*!< The external tensor being input */
        TTL_ext_short_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;   ///< The event used to wait for import (external to internal) completion
    TTL_event_t *event_out;  ///< The event used to wait for export (internal to external) completion

    // Cache previously obtained tiles.
    TTL_tile_t next_exported_tile;                 // The tile to be exported on the next step.
    TTL_int_short_sub_tensor_t int_prev_imported;  // Cache previously imported internal buffer.
} TTL_simplex_const_short_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_short_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

static inline TTL_simplex_const_short_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local short *int_base1, __local short *int_base2, __local short *int_base3, TTL_ext_short_tensor_t ext_tensor_in,
    TTL_ext_short_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_short_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

1278static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1279 TTL_simplex_const_short_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1280 TTL_tile_t tile_current_export) {
1281 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1282 // index contains the tile that is to be exported, so prepare the structures before beginning the export and export.
1283 const TTL_layout_t next_import_layout =
1284 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1285 const TTL_int_short_sub_tensor_t next_import_int_sub_tensor =
1286 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1287 tile_next_import.shape,
1288 next_import_layout,
1289 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1290 tile_next_import.offset);
1291 const TTL_const_ext_short_tensor_t next_import_ext_tensor =
1293 tile_next_import.shape,
1294 simplex_buffer->common.ext_tensor_in.layout,
1295 tile_next_import.offset,
1296 simplex_buffer->common.ext_tensor_in.elem_size);
1297
1298 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1299 simplex_buffer->next_exported_tile.shape.height);
1300 const TTL_int_short_tensor_t int_export_tensor =
1301 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1302 simplex_buffer->next_exported_tile.shape,
1303 int_export_layout,
1304 simplex_buffer->common.ext_tensor_out.elem_size);
1305 const TTL_ext_short_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1306 simplex_buffer->next_exported_tile.shape,
1307 simplex_buffer->common.ext_tensor_out.layout,
1308 simplex_buffer->next_exported_tile.offset,
1309 simplex_buffer->common.ext_tensor_out.elem_size);
1310
1311 // Wait for the previous (import/export)s to complete before starting the next.
1312 TTL_wait(1, simplex_buffer->event_out);
1313 TTL_wait(1, simplex_buffer->event_in);
1314
1315 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1317 *TTL_to_void_tensor(&export_to),
1318 simplex_buffer->event_out);
1319
1320 if (TTL_tile_empty(tile_next_import) == false)
1321 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1322 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1323 simplex_buffer->event_in);
1324
1325 // The import/export has been started for the current tile, Move to the next
1326 // tile.
1327 simplex_buffer->common.index =
1328 (simplex_buffer->common.index + 1) %
1329 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Write to.
1330
1331 // Retrieve buffer imported previously to read from now.
1332 const TTL_int_short_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1333 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1334
1335 // The output buffer can be written according to the size of the current
1336 // tile, rather than the size most recently exported.
1337 const TTL_layout_t curr_int_layout =
1338 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1339 const TTL_int_short_sub_tensor_t int_curr_buff_out =
1340 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1341 tile_current_export.shape,
1342 curr_int_layout,
1343 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1344 tile_current_export.offset);
1345
1346 // Cache the current tile so repeated get_tile() calls are not needed.
1347 simplex_buffer->next_exported_tile = tile_current_export;
1348
1349 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1350}
1351
1352static inline void __attribute__((overloadable)) TTL_finish_buffering(
1353 TTL_simplex_const_short_tensor_buffering_t *const simplex_buffering) {
1354 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1355 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1356}
1357/*
1358 * TTL_simplex_scheme.h
1359 *
1360 * Copyright (c) 2023 Mobileye
1361 *
1362 * Licensed under the Apache License, Version 2.0 (the License);
1363 * you may not use this file except in compliance with the License.
1364 * You may obtain a copy of the License at
1365 *
1366 * http://www.apache.org/licenses/LICENSE-2.0
1367 *
1368 * Unless required by applicable law or agreed to in writing, software
1369 * distributed under the License is distributed on an AS IS BASIS,
1370 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1371 * See the License for the specific language governing permissions and
1372 * limitations under the License.
1373 */
1374
1375// clang-format off
1376/**
1377 * @file
1378 *
1379 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1380 * three internal buffers, in rotation: each buffer interchangeably serves as input
1381 * buffer and output buffer, such that in each iteration one buffer is used both to
1382 * export then import and two buffers are used by compute for reading and writing.
1383 *
1384 * With simplex buffering we're only waiting for previous iterations, so DMA
1385 * transactions run mostly in parallel to computation, but serially with each
1386 * other. Using the same buffer both for import and export is possible allowing us
1387 * to overlap exporting from and importing to the same buffer.
1388 *
1389 * The following table draws the pipelined actions performed in simplex buffering.
1390 * It specifies which tile is processed in each iteration:
1391 *
1392 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1393 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1394 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1395 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1396 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1397 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1398 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1399 *
1400 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1401 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
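 *
 * As a sketch, the outer loop this scheme is designed for looks as follows.
 * The tiler, TTL_get_tile, TTL_number_of_tiles and compute are assumptions
 * taken from the TTL examples rather than defined in this file, and
 * TTL_get_tile is assumed to return an empty tile for an out-of-range id:
 *
 * @code
 * // Prolog (iteration #-1): start the scheme and import tile 0.
 * TTL_simplex_const_ushort_tensor_buffering_t scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &e_in, &e_out, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     // Import tile i+1, export the previously computed tile, compute tile i.
 *     TTL_io_ushort_tensor_t io = TTL_step_buffering(
 *         &scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 *     compute(io.imported_to, io.to_export_from);
 * }
 *
 * // Epilogs (iterations #NumOfTiles and #NumOfTiles+1): drain the pipeline.
 * TTL_finish_buffering(&scheme);
 * @endcode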
1402 *
1403 * @example TTL_simplex_buffering.cl
1404 */
1405// clang-format on
1406
1407 // This file presumes that the following have been pre-included.
1408 // This is not done here for path reasons.
1409// #include "TTL_core.h"
1410// #include "TTL_import_export.h"
1411// #include TTL_IMPORT_EXPORT_INCLUDE_H
1412
1413/**
1414 * @brief The structs used for this buffering type
1415 */
1416// TTL_simplex_buffering_t
1417typedef struct {
1418 struct {
1419 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1420 0->1->0->1... etc */
1421 __local ushort *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1422 TTL_ext_ushort_tensor_t ext_tensor_in; /*!< The external tensor being input */
1423 TTL_ext_ushort_tensor_t ext_tensor_out; /*!< The external tensor being output */
1424 } common; ///< The information that is common to all pipeline schemes
1425
1426 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfer */
1427 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfer */
1428 // Cache previously fetched tiles.
1429 TTL_tile_t next_exported_tile;
1430 TTL_int_ushort_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1431 } TTL_simplex_const_ushort_tensor_buffering_t;
1432
1433/**
1434 * Forward declarations for file ordering purposes
1435 */
1436static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1437 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1438 TTL_tile_t tile_current_export);
1439
1440/**
1441 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1442 *
1443 * @param int_base1 The address of the first buffer to be used in local memory
1444 * @param int_base2 The address of the second buffer to be used in local memory
1445 * @param int_base3 The address of the third buffer to be used in local memory
1446 * @param ext_tensor_in The external tensor to import the input data from
1447 * @param ext_tensor_out The external tensor to export the output data to
1448 * @param event_in A pointer to the event to use for the inward (external to
1449 * internal) transfer completion
1450 * @param event_out A pointer to the event to use for the outward (internal to
1451 * external) transfer completion
1452 * @param first_tile The first tile to fetch for the scheme
1453 *
1454 * The scheme rotates its three local buffers through the import, compute and export roles described at the top of this file.
1455 *
1456 * @return The TTL_simplex_buffering_t created from the input parameters
1457 *
1458 * Example:
1459 * @code
1460 * TTL_event_t tb_e_in = TTL_get_event();
1461 * TTL_event_t tb_e_out = TTL_get_event();
1462 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1463 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1464 *     &tb_e_in, &tb_e_out, first_tile);
1465 * @endcode
1466 * \n
1467 *
1468 * This can be optimized and standardized using the TTL_step_buffering
1469 * call.
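 *
 * For example, a single pipeline step per tile (the tiler helpers and
 * compute are assumptions from the TTL examples, not defined in this file):
 *
 * @code
 * TTL_io_ushort_tensor_t io = TTL_step_buffering(
 *     &tb_scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 * compute(io.imported_to, io.to_export_from);
 * @endcode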
1470 *
1471 * @startuml
1472 *
1473 * start
1474 *
1475 *
1476 * stop
1477 *
1478 * @enduml
1479 *
1480 */
1481 static inline TTL_simplex_const_ushort_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1482 __local ushort *int_base1, __local ushort *int_base2, __local ushort *int_base3,
1483 TTL_ext_ushort_tensor_t ext_tensor_in, TTL_ext_ushort_tensor_t ext_tensor_out, TTL_event_t *event_in,
1484 TTL_event_t *event_out, TTL_tile_t first_tile) {
1485 TTL_simplex_const_ushort_tensor_buffering_t result;
1486
1487 result.common.int_base[0] = int_base1;
1488 result.common.int_base[1] = int_base2;
1489 result.common.int_base[2] = int_base3;
1490 result.common.ext_tensor_in = ext_tensor_in;
1491 result.common.ext_tensor_out = ext_tensor_out;
1492 result.event_in = event_in;
1493 result.event_out = event_out;
1494 result.next_exported_tile = TTL_create_empty_tile();
1495
1496 result.common.index = 0;
1497
1498 result.int_prev_imported = TTL_create_empty_int_sub_tensor(int_base1);
1499
1500 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1501
1502 return result;
1503}
1504
1505static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1506 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1507 TTL_tile_t tile_current_export) {
1508 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1509 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1510 const TTL_layout_t next_import_layout =
1511 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1512 const TTL_int_ushort_sub_tensor_t next_import_int_sub_tensor =
1513 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1514 tile_next_import.shape,
1515 next_import_layout,
1516 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1517 tile_next_import.offset);
1518 const TTL_const_ext_ushort_tensor_t next_import_ext_tensor =
1519 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1520 tile_next_import.shape,
1521 simplex_buffer->common.ext_tensor_in.layout,
1522 tile_next_import.offset,
1523 simplex_buffer->common.ext_tensor_in.elem_size);
1524
1525 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1526 simplex_buffer->next_exported_tile.shape.height);
1527 const TTL_int_ushort_tensor_t int_export_tensor =
1528 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1529 simplex_buffer->next_exported_tile.shape,
1530 int_export_layout,
1531 simplex_buffer->common.ext_tensor_out.elem_size);
1532 const TTL_ext_ushort_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1533 simplex_buffer->next_exported_tile.shape,
1534 simplex_buffer->common.ext_tensor_out.layout,
1535 simplex_buffer->next_exported_tile.offset,
1536 simplex_buffer->common.ext_tensor_out.elem_size);
1537
1538 // Wait for the previous (import/export)s to complete before starting the next.
1539 TTL_wait(1, simplex_buffer->event_out);
1540 TTL_wait(1, simplex_buffer->event_in);
1541
1542 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1543 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1544 *TTL_to_void_tensor(&export_to),
1545 simplex_buffer->event_out);
1546
1547 if (TTL_tile_empty(tile_next_import) == false)
1548 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1549 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1550 simplex_buffer->event_in);
1551
1552 // The import/export has been started for the current tile; move to the
1553 // next tile.
1554 simplex_buffer->common.index =
1555 (simplex_buffer->common.index + 1) %
1556 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Selects the buffer written to next.
1557
1558 // Retrieve the buffer imported previously; it is the one read from now.
1559 const TTL_int_ushort_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1560 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1561
1562 // The output buffer can be written according to the size of the current
1563 // tile, rather than the size most recently exported.
1564 const TTL_layout_t curr_int_layout =
1565 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1566 const TTL_int_ushort_sub_tensor_t int_curr_buff_out =
1567 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1568 tile_current_export.shape,
1569 curr_int_layout,
1570 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1571 tile_current_export.offset);
1572
1573 // Cache the current tile so repeated get_tile() calls are not needed.
1574 simplex_buffer->next_exported_tile = tile_current_export;
1575
1576 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1577}
1578
1579static inline void __attribute__((overloadable)) TTL_finish_buffering(
1580 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffering) {
1581 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1582 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1583}
1584/*
1585 * TTL_simplex_scheme.h
1586 *
1587 * Copyright (c) 2023 Mobileye
1588 *
1589 * Licensed under the Apache License, Version 2.0 (the License);
1590 * you may not use this file except in compliance with the License.
1591 * You may obtain a copy of the License at
1592 *
1593 * http://www.apache.org/licenses/LICENSE-2.0
1594 *
1595 * Unless required by applicable law or agreed to in writing, software
1596 * distributed under the License is distributed on an AS IS BASIS,
1597 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1598 * See the License for the specific language governing permissions and
1599 * limitations under the License.
1600 */
1601
1602// clang-format off
1603/**
1604 * @file
1605 *
1606 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1607 * three internal buffers, in rotation: each buffer interchangeably serves as input
1608 * buffer and output buffer, such that in each iteration one buffer is used both to
1609 * export then import and two buffers are used by compute for reading and writing.
1610 *
1611 * With simplex buffering we're only waiting for previous iterations, so DMA
1612 * transactions run mostly in parallel to computation, but serially with each
1613 * other. Using the same buffer both for import and export is possible allowing us
1614 * to overlap exporting from and importing to the same buffer.
1615 *
1616 * The following table draws the pipelined actions performed in simplex buffering.
1617 * It specifies which tile is processed in each iteration:
1618 *
1619 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1620 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1621 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1622 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1623 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1624 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1625 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1626 *
1627 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1628 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
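 *
 * As a sketch, the outer loop this scheme is designed for looks as follows.
 * The tiler, TTL_get_tile, TTL_number_of_tiles and compute are assumptions
 * taken from the TTL examples rather than defined in this file, and
 * TTL_get_tile is assumed to return an empty tile for an out-of-range id:
 *
 * @code
 * // Prolog (iteration #-1): start the scheme and import tile 0.
 * TTL_simplex_const_long_tensor_buffering_t scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &e_in, &e_out, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     // Import tile i+1, export the previously computed tile, compute tile i.
 *     TTL_io_long_tensor_t io = TTL_step_buffering(
 *         &scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 *     compute(io.imported_to, io.to_export_from);
 * }
 *
 * // Epilogs (iterations #NumOfTiles and #NumOfTiles+1): drain the pipeline.
 * TTL_finish_buffering(&scheme);
 * @endcode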
1629 *
1630 * @example TTL_simplex_buffering.cl
1631 */
1632// clang-format on
1633
1634 // This file presumes that the following have been pre-included.
1635 // This is not done here for path reasons.
1636// #include "TTL_core.h"
1637// #include "TTL_import_export.h"
1638// #include TTL_IMPORT_EXPORT_INCLUDE_H
1639
1640/**
1641 * @brief The structs used for this buffering type
1642 */
1643// TTL_simplex_buffering_t
1644typedef struct {
1645 struct {
1646 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1647 0->1->0->1... etc */
1648 __local long *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1649 TTL_ext_long_tensor_t ext_tensor_in; /*!< The external tensor being input */
1650 TTL_ext_long_tensor_t ext_tensor_out; /*!< The external tensor being output */
1651 } common; ///< The information that is common to all pipeline schemes
1652
1653 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfer */
1654 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfer */
1655 // Cache previously fetched tiles.
1656 TTL_tile_t next_exported_tile;
1657 TTL_int_long_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1658 } TTL_simplex_const_long_tensor_buffering_t;
1659
1660/**
1661 * Forward declarations for file ordering purposes
1662 */
1663static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1664 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1665 TTL_tile_t tile_current_export);
1666
1667/**
1668 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1669 *
1670 * @param int_base1 The address of the first buffer to be used in local memory
1671 * @param int_base2 The address of the second buffer to be used in local memory
1672 * @param int_base3 The address of the third buffer to be used in local memory
1673 * @param ext_tensor_in The external tensor to import the input data from
1674 * @param ext_tensor_out The external tensor to export the output data to
1675 * @param event_in A pointer to the event to use for the inward (external to
1676 * internal) transfer completion
1677 * @param event_out A pointer to the event to use for the outward (internal to
1678 * external) transfer completion
1679 * @param first_tile The first tile to fetch for the scheme
1680 *
1681 * The scheme rotates its three local buffers through the import, compute and export roles described at the top of this file.
1682 *
1683 * @return The TTL_simplex_buffering_t created from the input parameters
1684 *
1685 * Example:
1686 * @code
1687 * TTL_event_t tb_e_in = TTL_get_event();
1688 * TTL_event_t tb_e_out = TTL_get_event();
1689 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1690 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1691 *     &tb_e_in, &tb_e_out, first_tile);
1692 * @endcode
1693 * \n
1694 *
1695 * This can be optimized and standardized using the TTL_step_buffering
1696 * call.
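 *
 * For example, a single pipeline step per tile (the tiler helpers and
 * compute are assumptions from the TTL examples, not defined in this file):
 *
 * @code
 * TTL_io_long_tensor_t io = TTL_step_buffering(
 *     &tb_scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 * compute(io.imported_to, io.to_export_from);
 * @endcode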
1697 *
1698 * @startuml
1699 *
1700 * start
1701 *
1702 *
1703 * stop
1704 *
1705 * @enduml
1706 *
1707 */
1708 static inline TTL_simplex_const_long_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1709 __local long *int_base1, __local long *int_base2, __local long *int_base3, TTL_ext_long_tensor_t ext_tensor_in,
1710 TTL_ext_long_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
1711 TTL_simplex_const_long_tensor_buffering_t result;
1712
1713 result.common.int_base[0] = int_base1;
1714 result.common.int_base[1] = int_base2;
1715 result.common.int_base[2] = int_base3;
1716 result.common.ext_tensor_in = ext_tensor_in;
1717 result.common.ext_tensor_out = ext_tensor_out;
1718 result.event_in = event_in;
1719 result.event_out = event_out;
1720 result.next_exported_tile = TTL_create_empty_tile();
1721
1722 result.common.index = 0;
1723
1724 result.int_prev_imported = TTL_create_empty_int_sub_tensor(int_base1);
1725
1726 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1727
1728 return result;
1729}
1730
1731static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1732 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1733 TTL_tile_t tile_current_export) {
1734 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1735 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1736 const TTL_layout_t next_import_layout =
1737 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1738 const TTL_int_long_sub_tensor_t next_import_int_sub_tensor =
1739 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1740 tile_next_import.shape,
1741 next_import_layout,
1742 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1743 tile_next_import.offset);
1744 const TTL_const_ext_long_tensor_t next_import_ext_tensor =
1745 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1746 tile_next_import.shape,
1747 simplex_buffer->common.ext_tensor_in.layout,
1748 tile_next_import.offset,
1749 simplex_buffer->common.ext_tensor_in.elem_size);
1750
1751 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1752 simplex_buffer->next_exported_tile.shape.height);
1753 const TTL_int_long_tensor_t int_export_tensor =
1754 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1755 simplex_buffer->next_exported_tile.shape,
1756 int_export_layout,
1757 simplex_buffer->common.ext_tensor_out.elem_size);
1758 const TTL_ext_long_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1759 simplex_buffer->next_exported_tile.shape,
1760 simplex_buffer->common.ext_tensor_out.layout,
1761 simplex_buffer->next_exported_tile.offset,
1762 simplex_buffer->common.ext_tensor_out.elem_size);
1763
1764 // Wait for the previous (import/export)s to complete before starting the next.
1765 TTL_wait(1, simplex_buffer->event_out);
1766 TTL_wait(1, simplex_buffer->event_in);
1767
1768 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1769 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1770 *TTL_to_void_tensor(&export_to),
1771 simplex_buffer->event_out);
1772
1773 if (TTL_tile_empty(tile_next_import) == false)
1774 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1775 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1776 simplex_buffer->event_in);
1777
1778 // The import/export has been started for the current tile; move to the
1779 // next tile.
1780 simplex_buffer->common.index =
1781 (simplex_buffer->common.index + 1) %
1782 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Selects the buffer written to next.
1783
1784 // Retrieve the buffer imported previously; it is the one read from now.
1785 const TTL_int_long_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1786 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1787
1788 // The output buffer can be written according to the size of the current
1789 // tile, rather than the size most recently exported.
1790 const TTL_layout_t curr_int_layout =
1791 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1792 const TTL_int_long_sub_tensor_t int_curr_buff_out =
1793 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1794 tile_current_export.shape,
1795 curr_int_layout,
1796 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1797 tile_current_export.offset);
1798
1799 // Cache the current tile so repeated get_tile() calls are not needed.
1800 simplex_buffer->next_exported_tile = tile_current_export;
1801
1802 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1803}
1804
1805static inline void __attribute__((overloadable)) TTL_finish_buffering(
1806 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffering) {
1807 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1808 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1809}
1810/*
1811 * TTL_simplex_scheme.h
1812 *
1813 * Copyright (c) 2023 Mobileye
1814 *
1815 * Licensed under the Apache License, Version 2.0 (the License);
1816 * you may not use this file except in compliance with the License.
1817 * You may obtain a copy of the License at
1818 *
1819 * http://www.apache.org/licenses/LICENSE-2.0
1820 *
1821 * Unless required by applicable law or agreed to in writing, software
1822 * distributed under the License is distributed on an AS IS BASIS,
1823 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1824 * See the License for the specific language governing permissions and
1825 * limitations under the License.
1826 */
1827
1828// clang-format off
1829/**
1830 * @file
1831 *
1832 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1833 * three internal buffers, in rotation: each buffer interchangeably serves as input
1834 * buffer and output buffer, such that in each iteration one buffer is used both to
1835 * export then import and two buffers are used by compute for reading and writing.
1836 *
1837 * With simplex buffering we're only waiting for previous iterations, so DMA
1838 * transactions run mostly in parallel to computation, but serially with each
1839 * other. Using the same buffer both for import and export is possible allowing us
1840 * to overlap exporting from and importing to the same buffer.
1841 *
1842 * The following table draws the pipelined actions performed in simplex buffering.
1843 * It specifies which tile is processed in each iteration:
1844 *
1845 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1846 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1847 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1848 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1849 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1850 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1851 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1852 *
1853 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1854 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
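 *
 * As a sketch, the outer loop this scheme is designed for looks as follows.
 * The tiler, TTL_get_tile, TTL_number_of_tiles and compute are assumptions
 * taken from the TTL examples rather than defined in this file, and
 * TTL_get_tile is assumed to return an empty tile for an out-of-range id:
 *
 * @code
 * // Prolog (iteration #-1): start the scheme and import tile 0.
 * TTL_simplex_const_ulong_tensor_buffering_t scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &e_in, &e_out, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     // Import tile i+1, export the previously computed tile, compute tile i.
 *     TTL_io_ulong_tensor_t io = TTL_step_buffering(
 *         &scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 *     compute(io.imported_to, io.to_export_from);
 * }
 *
 * // Epilogs (iterations #NumOfTiles and #NumOfTiles+1): drain the pipeline.
 * TTL_finish_buffering(&scheme);
 * @endcode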
1855 *
1856 * @example TTL_simplex_buffering.cl
1857 */
1858// clang-format on
1859
1860 // This file presumes that the following have been pre-included.
1861 // This is not done here for path reasons.
1862// #include "TTL_core.h"
1863// #include "TTL_import_export.h"
1864// #include TTL_IMPORT_EXPORT_INCLUDE_H
1865
1866/**
1867 * @brief The structs used for this buffering type
1868 */
1869// TTL_simplex_buffering_t
1870typedef struct {
1871 struct {
1872 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1873 0->1->0->1... etc */
1874 __local ulong *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1875 TTL_ext_ulong_tensor_t ext_tensor_in; /*!< The external tensor being input */
1876 TTL_ext_ulong_tensor_t ext_tensor_out; /*!< The external tensor being output */
1877 } common; ///< The information that is common to all pipeline schemes
1878
1879 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfer */
1880 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfer */
1881 // Cache previously fetched tiles.
1882 TTL_tile_t next_exported_tile;
1883 TTL_int_ulong_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1884 } TTL_simplex_const_ulong_tensor_buffering_t;
1885
1886/**
1887 * Forward declarations for file ordering purposes
1888 */
1889static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1890 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1891 TTL_tile_t tile_current_export);
1892
1893/**
1894 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1895 *
1896 * @param int_base1 The address of the first buffer to be used in local memory
1897 * @param int_base2 The address of the second buffer to be used in local memory
1898 * @param int_base3 The address of the third buffer to be used in local memory
1899 * @param ext_tensor_in The external tensor to import the input data from
1900 * @param ext_tensor_out The external tensor to export the output data to
1901 * @param event_in A pointer to the event to use for the inward (external to
1902 * internal) transfer completion
1903 * @param event_out A pointer to the event to use for the outward (internal to
1904 * external) transfer completion
1905 * @param first_tile The first tile to fetch for the scheme
1906 *
1907 * The scheme rotates its three local buffers through the import, compute and export roles described at the top of this file.
1908 *
1909 * @return The TTL_simplex_buffering_t created from the input parameters
1910 *
1911 * Example:
1912 * @code
1913 * TTL_event_t tb_e_in = TTL_get_event();
1914 * TTL_event_t tb_e_out = TTL_get_event();
1915 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1916 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1917 *     &tb_e_in, &tb_e_out, first_tile);
1918 * @endcode
1919 * \n
1920 *
1921 * This can be optimized and standardized using the TTL_step_buffering
1922 * call.
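 *
 * For example, a single pipeline step per tile (the tiler helpers and
 * compute are assumptions from the TTL examples, not defined in this file):
 *
 * @code
 * TTL_io_ulong_tensor_t io = TTL_step_buffering(
 *     &tb_scheme, TTL_get_tile(i + 1, tiler), TTL_get_tile(i, tiler));
 * compute(io.imported_to, io.to_export_from);
 * @endcode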
1923 *
1924 * @startuml
1925 *
1926 * start
1927 *
1928 *
1929 * stop
1930 *
1931 * @enduml
1932 *
1933 */
1934 static inline TTL_simplex_const_ulong_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1935 __local ulong *int_base1, __local ulong *int_base2, __local ulong *int_base3, TTL_ext_ulong_tensor_t ext_tensor_in,
1936 TTL_ext_ulong_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
1937 TTL_simplex_const_ulong_tensor_buffering_t result;
1938
1939 result.common.int_base[0] = int_base1;
1940 result.common.int_base[1] = int_base2;
1941 result.common.int_base[2] = int_base3;
1942 result.common.ext_tensor_in = ext_tensor_in;
1943 result.common.ext_tensor_out = ext_tensor_out;
1944 result.event_in = event_in;
1945 result.event_out = event_out;
1946 result.next_exported_tile = TTL_create_empty_tile();
1947
1948 result.common.index = 0;
1949
1950 result.int_prev_imported = TTL_create_empty_int_sub_tensor(int_base1);
1951
1952 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1953
1954 return result;
1955}
1956
1957static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1958 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1959 TTL_tile_t tile_current_export) {
1960 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1961 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1962 const TTL_layout_t next_import_layout =
1963 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1964 const TTL_int_ulong_sub_tensor_t next_import_int_sub_tensor =
1965 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1966 tile_next_import.shape,
1967 next_import_layout,
1968 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1969 tile_next_import.offset);
1970 const TTL_const_ext_ulong_tensor_t next_import_ext_tensor =
1971 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1972 tile_next_import.shape,
1973 simplex_buffer->common.ext_tensor_in.layout,
1974 tile_next_import.offset,
1975 simplex_buffer->common.ext_tensor_in.elem_size);
1976
1977 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1978 simplex_buffer->next_exported_tile.shape.height);
1979 const TTL_int_ulong_tensor_t int_export_tensor =
1980 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1981 simplex_buffer->next_exported_tile.shape,
1982 int_export_layout,
1983 simplex_buffer->common.ext_tensor_out.elem_size);
1984 const TTL_ext_ulong_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1985 simplex_buffer->next_exported_tile.shape,
1986 simplex_buffer->common.ext_tensor_out.layout,
1987 simplex_buffer->next_exported_tile.offset,
1988 simplex_buffer->common.ext_tensor_out.elem_size);
1989
1990 // Wait for the previous (import/export)s to complete before starting the next.
1991 TTL_wait(1, simplex_buffer->event_out);
1992 TTL_wait(1, simplex_buffer->event_in);
1993
1994 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1995 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1996 *TTL_to_void_tensor(&export_to),
1997 simplex_buffer->event_out);
1998
1999 if (TTL_tile_empty(tile_next_import) == false)
2000 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2001 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
2002 simplex_buffer->event_in);
2003
2004 // The import/export has been started for the current tile; move to the
2005 // next tile.
2006 simplex_buffer->common.index =
2007 (simplex_buffer->common.index + 1) %
2008 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Selects the buffer written to next.
2009
2010 // Retrieve the buffer imported previously; it is the one read from now.
2011 const TTL_int_ulong_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
2012 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
2013
2014 // The output buffer can be written according to the size of the current
2015 // tile, rather than the size most recently exported.
2016 const TTL_layout_t curr_int_layout =
2017 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2018 const TTL_int_ulong_sub_tensor_t int_curr_buff_out =
2019 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
2020 tile_current_export.shape,
2021 curr_int_layout,
2022 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
2023 tile_current_export.offset);
2024
2025 // Cache the current tile so repeated get_tile() calls are not needed.
2026 simplex_buffer->next_exported_tile = tile_current_export;
2027
2028 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
2029}
2030
2031static inline void __attribute__((overloadable)) TTL_finish_buffering(
2032 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffering) {
2033 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
2034 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
2035}