Tensor Tiling Library
 
pipelines/TTL_duplex_scheme.h
1/*
2 * TTL_duplex_scheme.h
3 *
4 * Copyright (c) 2025 Mobileye
5 *
6 * Licensed under the Apache License, Version 2.0 (the License);
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19// clang-format off
20/**
21 * @file
22 *
23 * Given a pair of blocking import and export operations that can execute concurrently,
24 * TTL_duplex_buffering issues them together and then waits on both to complete,
25 * hopefully executing them in parallel to each other. This scheme uses two
26 * internal buffers, one for the import and one for the export. Note that the
27 * export is pipelined to pair the import of the current tile with the export of
28 * the previous tile.
29 *
30 * The following table shows the pipelined actions performed in duplex buffering.
31 * It specifies which tile is processed in each iteration:
32 *
33 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
34 * |-------------------|-----|-----|----------------------|---------------|
35 * | **Import** | 0 | 1 | i | |
36 * | **Wait Import** | 0 | 1 | i | |
37 * | **Compute** | 0 | 1 | i | |
38 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
39 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
40 *
41 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
42 *
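 * Sketched as a loop (the tiler helper TTL_get_tile and the compute() call are
 * placeholders, not definitions from this file), the schedule in the table
 * corresponds to:
 *
 * @code
 * for (int i = 0; i < TTL_number_of_tiles(tiler) + 1; ++i) {  // +1 for the epilog iteration
 *     TTL_tile_t tile = TTL_get_tile(i, tiler);               // assumed empty once i == NumOfTiles
 *     TTL_io_void_tensor_t io = TTL_step_buffering(&duplex_buffering, tile, tile);
 *     if (!TTL_tile_empty(tile))
 *         compute(io);                                        // tile i: read the import, fill the export
 * }
 * @endcode
 *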
43 * When including this file the following must be defined
44 *
45 * #define TTL_TENSOR_TYPE void
46 * #define TTL_TENSOR_TYPE uchar
47 * etc
48 *
49 * @example TTL_duplex_buffering.cl
50 */
51// clang-format on
52
53// This file presumes that the following have been pre included.
54// this is not done here for path reasons.
55// #include "TTL_core.h"
56// #include "TTL_import_export.h"
57// #include TTL_IMPORT_EXPORT_INCLUDE_H
58
59/**
60 * @def The structs used for this buffering type
61 */
62
63/**
64 * @brief Data required to perform duplex buffer pipelining.
65 *
66 * @see TTL_start_duplex_buffering for a description of duplex buffer
67 * pipelining.
68 */
69typedef struct {
70 struct {
71 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
72 0->1->0->1... etc */
73 __local void *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
74 TTL_ext_void_tensor_t ext_tensor_in; /*!< The external tensor being input */
75 TTL_ext_void_tensor_t ext_tensor_out; /*!< The external tensor being output */
76 } common; ///< The information that is common to all pipeline schemes
77
78 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
79 ///< external to internal transfers, the second for
80 ///< internal to external transfers
81
82 /**
83 * @brief Store of the buffers used for the previous import/export cycles.
84 *
85 */
86 struct {
87 TTL_ext_void_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
88 TTL_const_int_void_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
89 } prev_out_tensors;
90} TTL_duplex_const_void_tensor_buffering_t;
91
92/*
93 * Predeclare TTL_step_buffering.
94 */
95static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
96 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
97 TTL_tile_t tile_current_export);
98
99/**
100 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
101 *
102 * @param ext_tensor_in A tensor describing the input in global memory
103 * @param int_base_in The address of the local import buffer.
104 * @param ext_tensor_out A tensor describing the output in global memory
105 * @param int_base_out The address of the local export buffer.
106 * @param events A pointer to a list of 2 events.
107 * The first event in the list will be used for imports, the second event in
108 * the list will be used for exports.
109 * @param first_tile The first tile to fetch for the scheme
110 *
111 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
112 *
113 * The first event in the list will be used for imports,
114 * the second event in the list will be used for exports.
115 * \n\n Example:
116 * @code
117 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
118 *
119 * TTL_duplex_const_void_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
120 * ext_tensor_in, l_buffers[0],
121 * ext_tensor_out, l_buffers[1],
122 * &events, first_tile);
123 * @endcode
124 * \n
125 *
126 *
127 *
128 * Duplex buffering in more detail:
129 *
130 * The simplest form of duplex buffering takes the following flow.
131 *
132 * @startuml
133 *
134 * start
135 *
136 * :Create a TTL_tiler_t with TTL_create_tiler;
137 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
138 * 1 input buffer, 1 output buffer;
139 * :NumberOfTiles = TTL_number_of_tiles(tiler);
140 *
141 * while (for each tile)
142 *
143 * :Import The Next Tile into the input buffer;
144 *
145 * :Process the Tile from the input buffer to the output buffer;
146 *
147 * :Export the Processed Tile from the output buffer;
148 *
149 * endwhile
150 *
151 * stop
152 *
153 * @enduml
154 *
155 * This can be optimized and standardized using the TTL_step_buffering
156 * call.
157 *
158 * @startuml
159 *
160 * start
161 *
162 * :Create a TTL_tiler_t with TTL_create_tiler;
163 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
164 * :NumberOfTiles = TTL_number_of_tiles(tiler);
165 *
166 * while (for each tile)
167 *
168 * :Call TTL_step_buffering for the current tile
169 *
170 * This will import the current new tile and export the last tile
171 * in parallel;
172 *
173 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
174 * :Process the Tile from the input buffer to the output buffer;
175 * endif
176 *
177 * endwhile
178 *
179 * stop
180 *
181 * @enduml
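 *
 * Putting this together, a complete kernel built on this scheme might look as
 * follows. This is an illustrative sketch only, shown for uchar data: the
 * tiler helpers (TTL_create_shape, TTL_create_tiler, TTL_get_tile), the
 * single-argument TTL_create_layout and three-argument TTL_create_ext_tensor
 * overloads, the io tensor field names and the compute() routine are
 * assumptions here, not definitions from this file; see the
 * TTL_duplex_buffering.cl example for the canonical version.
 *
 * @code
 * __kernel void duplex_example(__global uchar *restrict ext_in, __global uchar *restrict ext_out,
 *                              int width, int height, int stride) {
 *     __local uchar l_in[64 * 64];   // import buffer, one tile
 *     __local uchar l_out[64 * 64];  // export buffer, one tile
 *
 *     const TTL_shape_t image_shape = TTL_create_shape(width, height);
 *     const TTL_tiler_t tiler = TTL_create_tiler(image_shape, TTL_create_shape(64, 64));
 *     const TTL_layout_t ext_layout = TTL_create_layout(stride);
 *
 *     TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
 *     TTL_duplex_const_uchar_tensor_buffering_t duplex_scheme = TTL_start_duplex_buffering(
 *         TTL_create_ext_tensor(ext_in, image_shape, ext_layout), l_in,
 *         TTL_create_ext_tensor(ext_out, image_shape, ext_layout), l_out,
 *         &events, TTL_get_tile(0, tiler));
 *
 *     // One extra pass (the epilog) drains the export of the final tile.
 *     for (int i = 0; i < TTL_number_of_tiles(tiler) + 1; ++i) {
 *         const TTL_tile_t tile = TTL_get_tile(i, tiler);
 *         TTL_io_uchar_tensor_t io = TTL_step_buffering(&duplex_scheme, tile, tile);
 *
 *         if (!TTL_tile_empty(tile))
 *             compute(io.imported_to, io.to_export_from);  // user-supplied processing
 *     }
 *
 *     TTL_finish_buffering(&duplex_scheme);
 * }
 * @endcode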
182 */
183static inline TTL_duplex_const_void_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
184 TTL_ext_void_tensor_t ext_tensor_in, __local void *int_base_in, TTL_ext_void_tensor_t ext_tensor_out,
185 __local void *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
186 TTL_duplex_const_void_tensor_buffering_t result;
187 result.common.int_base[0] = int_base_in;
188 result.common.int_base[1] = int_base_out;
189
190 result.common.ext_tensor_in = ext_tensor_in;
191 result.common.ext_tensor_out = ext_tensor_out;
192 result.events = events;
195
196 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
197
198 return result;
199}
200
201static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
202 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
203 TTL_tile_t tile_current_export) {
204 const TTL_layout_t next_import_layout =
205 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
206 const TTL_const_ext_void_tensor_t next_import_ext_tensor =
207 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
208 tile_current_import.shape,
209 duplex_buffering->common.ext_tensor_in.layout,
210 tile_current_import.offset,
211 duplex_buffering->common.ext_tensor_in.elem_size);
212 const TTL_int_void_sub_tensor_t next_import_int_sub_tensor =
213 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
214 tile_current_import.shape,
215 next_import_layout,
216 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
217 tile_current_import.offset);
218
219 const TTL_const_int_void_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
220 const TTL_ext_void_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
221
222 if (TTL_tile_empty(tile_current_import) == false)
223 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
224 *TTL_to_void_tensor(&next_import_ext_tensor),
225 &(*duplex_buffering->events)[0]);
226
227 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
228 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
229 *TTL_to_void_tensor(&next_export_ext_tensor),
230 &(*duplex_buffering->events)[1]);
231
232 const TTL_layout_t int_export_layout =
233 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
234 const TTL_ext_void_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
235 tile_current_export.shape,
236 duplex_buffering->common.ext_tensor_out.layout,
237 tile_current_export.offset,
238 duplex_buffering->common.ext_tensor_out.elem_size);
239 const TTL_int_void_sub_tensor_t to_export_from =
240 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
241 tile_current_export.shape,
242 int_export_layout,
243 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
244 tile_current_export.offset);
245
246 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
247 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
248
249 TTL_wait(2, *duplex_buffering->events);
250
251 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
252}
253
254static inline void __attribute__((overloadable)) TTL_finish_buffering(
255 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering) {
257}
258/*
259 * TTL_duplex_scheme.h
260 *
261 * Copyright (c) 2025 Mobileye
262 *
263 * Licensed under the Apache License, Version 2.0 (the License);
264 * you may not use this file except in compliance with the License.
265 * You may obtain a copy of the License at
266 *
267 * http://www.apache.org/licenses/LICENSE-2.0
268 *
269 * Unless required by applicable law or agreed to in writing, software
270 * distributed under the License is distributed on an AS IS BASIS,
271 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
272 * See the License for the specific language governing permissions and
273 * limitations under the License.
274 */
275
276// clang-format off
277/**
278 * @file
279 *
280 * Given a pair of blocking import and export operations that can execute concurrently,
281 * TTL_duplex_buffering issues them together and then waits on both to complete,
282 * hopefully executing them in parallel to each other. This scheme uses two
283 * internal buffers, one for the import and one for the export. Note that the
284 * export is pipelined to pair the import of the current tile with the export of
285 * the previous tile.
286 *
287 * The following table shows the pipelined actions performed in duplex buffering.
288 * It specifies which tile is processed in each iteration:
289 *
290 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
291 * |-------------------|-----|-----|----------------------|---------------|
292 * | **Import** | 0 | 1 | i | |
293 * | **Wait Import** | 0 | 1 | i | |
294 * | **Compute** | 0 | 1 | i | |
295 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
296 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
297 *
298 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
299 *
300 * When including this file the following must be defined
301 *
302 * #define TTL_TENSOR_TYPE void
303 * #define TTL_TENSOR_TYPE uchar
304 * etc
305 *
306 * @example TTL_duplex_buffering.cl
307 */
308// clang-format on
309
310// This file presumes that the following have been pre included.
311// this is not done here for path reasons.
312// #include "TTL_core.h"
313// #include "TTL_import_export.h"
314// #include TTL_IMPORT_EXPORT_INCLUDE_H
315
316/**
317 * @def The structs used for this buffering type
318 */
319
320/**
321 * @brief Data required to perform duplex buffer pipelining.
322 *
323 * @see TTL_start_duplex_buffering for a description of duplex buffer
324 * pipelining.
325 */
326typedef struct {
327 struct {
328 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
329 0->1->0->1... etc */
330 __local char *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
331 TTL_ext_char_tensor_t ext_tensor_in; /*!< The external tensor being input */
332 TTL_ext_char_tensor_t ext_tensor_out; /*!< The external tensor being output */
333 } common; ///< The information that is common to all pipeline schemes
334
335 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
336 ///< external to internal transfers, the second for
337 ///< internal to external transfers
338
339 /**
340 * @brief Store of the buffers used for the previous import/export cycles.
341 *
342 */
343 struct {
344 TTL_ext_char_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
345 TTL_const_int_char_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
346 } prev_out_tensors;
347} TTL_duplex_const_char_tensor_buffering_t;
348
349/*
350 * Predeclare TTL_step_buffering.
351 */
352static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
353 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
354 TTL_tile_t tile_current_export);
355
356/**
357 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
358 *
359 * @param ext_tensor_in A tensor describing the input in global memory
360 * @param int_base_in The address of the local import buffer.
361 * @param ext_tensor_out A tensor describing the output in global memory
362 * @param int_base_out The address of the local export buffer.
363 * @param events A pointer to a list of 2 events.
364 * The first event in the list will be used for imports, the second event in
365 * the list will be used for exports.
366 * @param first_tile The first tile to fetch for the scheme
367 *
368 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
369 *
370 * The first event in the list will be used for imports,
371 * the second event in the list will be used for exports.
372 * \n\n Example:
373 * @code
374 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
375 *
376 * TTL_duplex_const_char_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
377 * ext_tensor_in, l_buffers[0],
378 * ext_tensor_out, l_buffers[1],
379 * &events, first_tile);
380 * @endcode
381 * \n
382 *
383 *
384 *
385 * Duplex buffering in more detail:
386 *
387 * The simplest form of duplex buffering takes the following flow.
388 *
389 * @startuml
390 *
391 * start
392 *
393 * :Create a TTL_tiler_t with TTL_create_tiler;
394 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
395 * 1 input buffer, 1 output buffer;
396 * :NumberOfTiles = TTL_number_of_tiles(tiler);
397 *
398 * while (for each tile)
399 *
400 * :Import The Next Tile into the input buffer;
401 *
402 * :Process the Tile from the input buffer to the output buffer;
403 *
404 * :Export the Processed Tile from the output buffer;
405 *
406 * endwhile
407 *
408 * stop
409 *
410 * @enduml
411 *
412 * This can be optimized and standardized using the TTL_step_buffering
413 * call.
414 *
415 * @startuml
416 *
417 * start
418 *
419 * :Create a TTL_tiler_t with TTL_create_tiler;
420 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
421 * :NumberOfTiles = TTL_number_of_tiles(tiler);
422 *
423 * while (for each tile)
424 *
425 * :Call TTL_step_buffering for the current tile
426 *
427 * This will import the current new tile and export the last tile
428 * in parallel;
429 *
430 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
431 * :Process the Tile from the input buffer to the output buffer;
432 * endif
433 *
434 * endwhile
435 *
436 * stop
437 *
438 * @enduml
439 */
440static inline TTL_duplex_const_char_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
441 TTL_ext_char_tensor_t ext_tensor_in, __local char *int_base_in, TTL_ext_char_tensor_t ext_tensor_out,
442 __local char *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
443 TTL_duplex_const_char_tensor_buffering_t result;
444 result.common.int_base[0] = int_base_in;
445 result.common.int_base[1] = int_base_out;
446
447 result.common.ext_tensor_in = ext_tensor_in;
448 result.common.ext_tensor_out = ext_tensor_out;
449 result.events = events;
452
453 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
454
455 return result;
456}
457
458static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
459 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
460 TTL_tile_t tile_current_export) {
461 const TTL_layout_t next_import_layout =
462 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
463 const TTL_const_ext_char_tensor_t next_import_ext_tensor =
464 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
465 tile_current_import.shape,
466 duplex_buffering->common.ext_tensor_in.layout,
467 tile_current_import.offset,
468 duplex_buffering->common.ext_tensor_in.elem_size);
469 const TTL_int_char_sub_tensor_t next_import_int_sub_tensor =
470 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
471 tile_current_import.shape,
472 next_import_layout,
473 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
474 tile_current_import.offset);
475
476 const TTL_const_int_char_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
477 const TTL_ext_char_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
478
479 if (TTL_tile_empty(tile_current_import) == false)
480 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
481 *TTL_to_void_tensor(&next_import_ext_tensor),
482 &(*duplex_buffering->events)[0]);
483
484 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
485 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
486 *TTL_to_void_tensor(&next_export_ext_tensor),
487 &(*duplex_buffering->events)[1]);
488
489 const TTL_layout_t int_export_layout =
490 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
491 const TTL_ext_char_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
492 tile_current_export.shape,
493 duplex_buffering->common.ext_tensor_out.layout,
494 tile_current_export.offset,
495 duplex_buffering->common.ext_tensor_out.elem_size);
496 const TTL_int_char_sub_tensor_t to_export_from =
497 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
498 tile_current_export.shape,
499 int_export_layout,
500 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
501 tile_current_export.offset);
502
503 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
504 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
505
506 TTL_wait(2, *duplex_buffering->events);
507
508 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
509}
510
511static inline void __attribute__((overloadable)) TTL_finish_buffering(
512 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering) {
514}
515/*
516 * TTL_duplex_scheme.h
517 *
518 * Copyright (c) 2025 Mobileye
519 *
520 * Licensed under the Apache License, Version 2.0 (the License);
521 * you may not use this file except in compliance with the License.
522 * You may obtain a copy of the License at
523 *
524 * http://www.apache.org/licenses/LICENSE-2.0
525 *
526 * Unless required by applicable law or agreed to in writing, software
527 * distributed under the License is distributed on an AS IS BASIS,
528 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
529 * See the License for the specific language governing permissions and
530 * limitations under the License.
531 */
532
533// clang-format off
534/**
535 * @file
536 *
537 * Given a pair of blocking import and export operations that can execute concurrently,
538 * TTL_duplex_buffering issues them together and then waits on both to complete,
539 * hopefully executing them in parallel to each other. This scheme uses two
540 * internal buffers, one for the import and one for the export. Note that the
541 * export is pipelined to pair the import of the current tile with the export of
542 * the previous tile.
543 *
544 * The following table shows the pipelined actions performed in duplex buffering.
545 * It specifies which tile is processed in each iteration:
546 *
547 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
548 * |-------------------|-----|-----|----------------------|---------------|
549 * | **Import** | 0 | 1 | i | |
550 * | **Wait Import** | 0 | 1 | i | |
551 * | **Compute** | 0 | 1 | i | |
552 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
553 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
554 *
555 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
556 *
557 * When including this file the following must be defined
558 *
559 * #define TTL_TENSOR_TYPE void
560 * #define TTL_TENSOR_TYPE uchar
561 * etc
562 *
563 * @example TTL_duplex_buffering.cl
564 */
565// clang-format on
566
567// This file presumes that the following have been pre included.
568// this is not done here for path reasons.
569// #include "TTL_core.h"
570// #include "TTL_import_export.h"
571// #include TTL_IMPORT_EXPORT_INCLUDE_H
572
573/**
574 * @def The structs used for this buffering type
575 */
576
577/**
578 * @brief Data required to perform duplex buffer pipelining.
579 *
580 * @see TTL_start_duplex_buffering for a description of duplex buffer
581 * pipelining.
582 */
583typedef struct {
584 struct {
585 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
586 0->1->0->1... etc */
587 __local uchar *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
588 TTL_ext_uchar_tensor_t ext_tensor_in; /*!< The external tensor being input */
589 TTL_ext_uchar_tensor_t ext_tensor_out; /*!< The external tensor being output */
590 } common; ///< The information that is common to all pipeline schemes
591
592 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
593 ///< external to internal transfers, the second for
594 ///< internal to external transfers
595
596 /**
597 * @brief Store of the buffers used for the previous import/export cycles.
598 *
599 */
600 struct {
601 TTL_ext_uchar_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
602 TTL_const_int_uchar_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
603 } prev_out_tensors;
604} TTL_duplex_const_uchar_tensor_buffering_t;
605
606/*
607 * Predeclare TTL_step_buffering.
608 */
609static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
610 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
611 TTL_tile_t tile_current_export);
612
613/**
614 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
615 *
616 * @param ext_tensor_in A tensor describing the input in global memory
617 * @param int_base_in The address of the local import buffer.
618 * @param ext_tensor_out A tensor describing the output in global memory
619 * @param int_base_out The address of the local export buffer.
620 * @param events A pointer to a list of 2 events.
621 * The first event in the list will be used for imports, the second event in
622 * the list will be used for exports.
623 * @param first_tile The first tile to fetch for the scheme
624 *
625 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
626 *
627 * The first event in the list will be used for imports,
628 * the second event in the list will be used for exports.
629 * \n\n Example:
630 * @code
631 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
632 *
633 * TTL_duplex_const_uchar_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
634 * ext_tensor_in, l_buffers[0],
635 * ext_tensor_out, l_buffers[1],
636 * &events, first_tile);
637 * @endcode
638 * \n
639 *
641 *
642 *
643 * Duplex buffering in more detail:
643 *
644 * The simplest form of duplex buffering takes the following flow.
645 *
646 * @startuml
647 *
648 * start
649 *
650 * :Create a TTL_tiler_t with TTL_create_tiler;
651 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
652 * 1 input buffer, 1 output buffer;
653 * :NumberOfTiles = TTL_number_of_tiles(tiler);
654 *
655 * while (for each tile)
656 *
657 * :Import The Next Tile into the input buffer;
658 *
659 * :Process the Tile from the input buffer to the output buffer;
660 *
661 * :Export the Processed Tile from the output buffer;
662 *
663 * endwhile
664 *
665 * stop
666 *
667 * @enduml
668 *
669 * This can be optimized and standardized using the TTL_step_buffering
670 * call.
671 *
672 * @startuml
673 *
674 * start
675 *
676 * :Create a TTL_tiler_t with TTL_create_tiler;
677 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
678 * :NumberOfTiles = TTL_number_of_tiles(tiler);
679 *
680 * while (for each tile)
681 *
682 * :Call TTL_step_buffering for the current tile
683 *
684 * This will import the current new tile and export the last tile
685 * in parallel;
686 *
687 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
688 * :Process the Tile from the input buffer to the output buffer;
689 * endif
690 *
691 * endwhile
692 *
693 * stop
694 *
695 * @enduml
696 */
697static inline TTL_duplex_const_uchar_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
698 TTL_ext_uchar_tensor_t ext_tensor_in, __local uchar *int_base_in, TTL_ext_uchar_tensor_t ext_tensor_out,
699 __local uchar *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
700 TTL_duplex_const_uchar_tensor_buffering_t result;
701 result.common.int_base[0] = int_base_in;
702 result.common.int_base[1] = int_base_out;
703
704 result.common.ext_tensor_in = ext_tensor_in;
705 result.common.ext_tensor_out = ext_tensor_out;
706 result.events = events;
709
710 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
711
712 return result;
713}
714
715static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
716 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
717 TTL_tile_t tile_current_export) {
718 const TTL_layout_t next_import_layout =
719 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
720 const TTL_const_ext_uchar_tensor_t next_import_ext_tensor =
721 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
722 tile_current_import.shape,
723 duplex_buffering->common.ext_tensor_in.layout,
724 tile_current_import.offset,
725 duplex_buffering->common.ext_tensor_in.elem_size);
726 const TTL_int_uchar_sub_tensor_t next_import_int_sub_tensor =
727 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
728 tile_current_import.shape,
729 next_import_layout,
730 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
731 tile_current_import.offset);
732
733 const TTL_const_int_uchar_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
734 const TTL_ext_uchar_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
735
736 if (TTL_tile_empty(tile_current_import) == false)
737 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
738 *TTL_to_void_tensor(&next_import_ext_tensor),
739 &(*duplex_buffering->events)[0]);
740
741 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
742 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
743 *TTL_to_void_tensor(&next_export_ext_tensor),
744 &(*duplex_buffering->events)[1]);
745
746 const TTL_layout_t int_export_layout =
747 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
748 const TTL_ext_uchar_tensor_t to_export_to =
749 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
750 tile_current_export.shape,
751 duplex_buffering->common.ext_tensor_out.layout,
752 tile_current_export.offset,
753 duplex_buffering->common.ext_tensor_out.elem_size);
754 const TTL_int_uchar_sub_tensor_t to_export_from =
755 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
756 tile_current_export.shape,
757 int_export_layout,
758 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
759 tile_current_export.offset);
760
761 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
762 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
763
764 TTL_wait(2, *duplex_buffering->events);
765
766 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
767}
768
769static inline void __attribute__((overloadable)) TTL_finish_buffering(
770 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering) {
772}
773/*
774 * TTL_duplex_scheme.h
775 *
776 * Copyright (c) 2025 Mobileye
777 *
778 * Licensed under the Apache License, Version 2.0 (the License);
779 * you may not use this file except in compliance with the License.
780 * You may obtain a copy of the License at
781 *
782 * http://www.apache.org/licenses/LICENSE-2.0
783 *
784 * Unless required by applicable law or agreed to in writing, software
785 * distributed under the License is distributed on an AS IS BASIS,
786 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
787 * See the License for the specific language governing permissions and
788 * limitations under the License.
789 */
790
791// clang-format off
792/**
793 * @file
794 *
795 * Given a pair of blocking import and export operations that can execute concurrently,
796 * TTL_duplex_buffering issues them together and then waits on both to complete,
797 * hopefully executing them in parallel to each other. This scheme uses two
798 * internal buffers, one for the import and one for the export. Note that the
799 * export is pipelined to pair the import of the current tile with the export of
800 * the previous tile.
801 *
802 * The following table shows the pipelined actions performed in duplex buffering.
803 * It specifies which tile is processed in each iteration:
804 *
805 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
806 * |-------------------|-----|-----|----------------------|---------------|
807 * | **Import** | 0 | 1 | i | |
808 * | **Wait Import** | 0 | 1 | i | |
809 * | **Compute** | 0 | 1 | i | |
810 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
811 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
812 *
813 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
814 *
815 * When including this file the following must be defined
816 *
817 * #define TTL_TENSOR_TYPE void
818 * #define TTL_TENSOR_TYPE uchar
819 * etc
820 *
821 * @example TTL_duplex_buffering.cl
822 */
823// clang-format on
824
825// This file presumes that the following have been pre included.
826// this is not done here for path reasons.
827// #include "TTL_core.h"
828// #include "TTL_import_export.h"
829// #include TTL_IMPORT_EXPORT_INCLUDE_H
830
831/**
832 * @def The structs used for this buffering type
833 */
834
835/**
836 * @brief Data required to perform duplex buffer pipelining.
837 *
838 * @see TTL_start_duplex_buffering for a description of duplex buffer
839 * pipelining.
840 */
841typedef struct {
842 struct {
843 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
844 0->1->0->1... etc */
845 __local int *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
846 TTL_ext_int_tensor_t ext_tensor_in; /*!< The external tensor being input */
847 TTL_ext_int_tensor_t ext_tensor_out; /*!< The external tensor being output */
848 } common; ///< The information that is common to all pipeline schemes
849
850 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
851 ///< external to internal transfers, the second for
852 ///< internal to external transfers
853
854 /**
855 * @brief Store of the buffers used for the previous import/export cycles.
856 *
857 */
858 struct {
859 TTL_ext_int_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
860 TTL_const_int_int_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
861 } prev_out_tensors;
862} TTL_duplex_const_int_tensor_buffering_t;
863
864/*
865 * Predeclare TTL_step_buffering.
866 */
867static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
868 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
869 TTL_tile_t tile_current_export);
870
871/**
872 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
873 *
874 * @param ext_tensor_in A tensor describing the input in global memory
875 * @param int_base_in The address of the local import buffer.
876 * @param ext_tensor_out A tensor describing the output in global memory
877 * @param int_base_out The address of the local export buffer.
878 * @param events A pointer to a list of 2 events.
879 * The first event in the list will be used for imports, the second event in
880 * the list will be used for exports.
881 * @param first_tile The first tile to fetch for the scheme
882 *
883 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
884 *
885 * The first event in the list will be used for imports,
886 * the second event in the list will be used for exports.
887 * \n\n Example:
888 * @code
889 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
890 *
891 * TTL_duplex_const_int_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
892 * ext_tensor_in, l_buffers[0],
893 * ext_tensor_out, l_buffers[1],
894 * &events, first_tile);
895 * @endcode
896 * \n
897 *
898 *
899 *
900 * Duplex buffering in more detail:
901 *
902 * The simplest form of duplex buffering takes the following flow.
903 *
904 * @startuml
905 *
906 * start
907 *
908 * :Create a TTL_tiler_t with TTL_create_tiler;
909 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
910 * 1 input buffer, 1 output buffer;
911 * :NumberOfTiles = TTL_number_of_tiles(tiler);
912 *
913 * while (for each tile)
914 *
915 * :Import The Next Tile into the input buffer;
916 *
917 * :Process the Tile from the input buffer to the output buffer;
918 *
919 * :Export the Processed Tile from the output buffer;
920 *
921 * endwhile
922 *
923 * stop
924 *
925 * @enduml
926 *
927 * This can be optimized and standardized using the TTL_step_buffering
928 * call.
929 *
930 * @startuml
931 *
932 * start
933 *
934 * :Create a TTL_tiler_t with TTL_create_tiler;
935 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
936 * :NumberOfTiles = TTL_number_of_tiles(tiler);
937 *
938 * while (for each tile)
939 *
940 * :Call TTL_step_buffering for the current tile
941 *
942 * This will import the current new tile and export the last tile
943 * in parallel;
944 *
945 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
946 * :Process the Tile from the input buffer to the output buffer;
947 * endif
948 *
949 * endwhile
950 *
951 * stop
952 *
953 * @enduml
954 */
955static inline TTL_duplex_const_int_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
956 TTL_ext_int_tensor_t ext_tensor_in, __local int *int_base_in, TTL_ext_int_tensor_t ext_tensor_out,
957 __local int *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
958 TTL_duplex_const_int_tensor_buffering_t result;
959 result.common.int_base[0] = int_base_in;
960 result.common.int_base[1] = int_base_out;
961
962 result.common.ext_tensor_in = ext_tensor_in;
963 result.common.ext_tensor_out = ext_tensor_out;
964 result.events = events;
967
968 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
969
970 return result;
971}
972
973static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
974 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
975 TTL_tile_t tile_current_export) {
976 const TTL_layout_t next_import_layout =
977 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
978 const TTL_const_ext_int_tensor_t next_import_ext_tensor =
979 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
980 tile_current_import.shape,
981 duplex_buffering->common.ext_tensor_in.layout,
982 tile_current_import.offset,
983 duplex_buffering->common.ext_tensor_in.elem_size);
984 const TTL_int_int_sub_tensor_t next_import_int_sub_tensor =
985 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
986 tile_current_import.shape,
987 next_import_layout,
988 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
989 tile_current_import.offset);
990
991 const TTL_const_int_int_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
992 const TTL_ext_int_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
993
994 if (TTL_tile_empty(tile_current_import) == false)
995 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
996 *TTL_to_void_tensor(&next_import_ext_tensor),
997 &(*duplex_buffering->events)[0]);
998
999 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1000 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1001 *TTL_to_void_tensor(&next_export_ext_tensor),
1002 &(*duplex_buffering->events)[1]);
1003
1004 const TTL_layout_t int_export_layout =
1005 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1006 const TTL_ext_int_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1007 tile_current_export.shape,
1008 duplex_buffering->common.ext_tensor_out.layout,
1009 tile_current_export.offset,
1010 duplex_buffering->common.ext_tensor_out.elem_size);
1011 const TTL_int_int_sub_tensor_t to_export_from =
1012 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1013 tile_current_export.shape,
1014 int_export_layout,
1015 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1016 tile_current_export.offset);
1017
1018 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1019 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1020
1021 TTL_wait(2, *duplex_buffering->events);
1022
1023 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1024}
1025
1026static inline void __attribute__((overloadable)) TTL_finish_buffering(
1027 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering) {
1029}
1030/*
1031 * TTL_duplex_scheme.h
1032 *
1033 * Copyright (c) 2025 Mobileye
1034 *
1035 * Licensed under the Apache License, Version 2.0 (the License);
1036 * you may not use this file except in compliance with the License.
1037 * You may obtain a copy of the License at
1038 *
1039 * http://www.apache.org/licenses/LICENSE-2.0
1040 *
1041 * Unless required by applicable law or agreed to in writing, software
1042 * distributed under the License is distributed on an AS IS BASIS,
1043 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1044 * See the License for the specific language governing permissions and
1045 * limitations under the License.
1046 */
1047
1048// clang-format off
1049/**
1050 * @file
1051 *
1052 * Given a pair of blocking import and export operations that can execute concurrently,
1053 * TTL_duplex_buffering issues them together and then waits on both to complete,
1054 * hopefully executing them in parallel to each other. This scheme uses two
1055 * internal buffers, one for the import and one for the export. Note that the
1056 * export is pipelined to pair the import of the current tile with the export of
1057 * the previous tile.
1058 *
1059 * The following table shows the pipelined actions performed in duplex buffering.
1060 * It specifies which tile is processed in each iteration:
1061 *
1062 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1063 * |-------------------|-----|-----|----------------------|---------------|
1064 * | **Import** | 0 | 1 | i | |
1065 * | **Wait Import** | 0 | 1 | i | |
1066 * | **Compute** | 0 | 1 | i | |
1067 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1068 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1069 *
1070 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1071 *
1072 * When including this file the following must be defined
1073 *
1074 * #define TTL_TENSOR_TYPE void
1075 * #define TTL_TENSOR_TYPE uchar
1076 * etc
1077 *
1078 * @example TTL_duplex_buffering.cl
1079 */
1080// clang-format on
1081
1082// This file presumes that the following have been pre included.
1083// this is not done here for path reasons.
1084// #include "TTL_core.h"
1085// #include "TTL_import_export.h"
1086// #include TTL_IMPORT_EXPORT_INCLUDE_H
1087
1088/**
1089 * @def The structs used for this buffering type
1090 */
1091
1092/**
1093 * @brief Data required to perform duplex buffer pipelining.
1094 *
1095 * @see TTL_start_duplex_buffering for a description of duplex buffer
1096 * pipelining.
1097 */
1098typedef struct {
1099 struct {
1100 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1101 0->1->0->1... etc */
1102 __local uint *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1103 TTL_ext_uint_tensor_t ext_tensor_in; /*!< The external tensor being input */
1104 TTL_ext_uint_tensor_t ext_tensor_out; /*!< The external tensor being output */
1105 } common; ///< The information that is common to all pipeline schemes
1106
1107 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
1108 ///< external to internal transfers, the second for
1109 ///< internal to external transfers
1110
1111 /**
1112 * @brief Store of the buffers used for the previous import/export cycles.
1113 *
1114 */
1115 struct {
1116 TTL_ext_uint_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
1117 TTL_const_int_uint_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
1118 } prev_out_tensors;
1119} TTL_duplex_const_uint_tensor_buffering_t;
1120
1121/*
1122 * Predeclare TTL_step_buffering.
1123 */
1124static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1125 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1126 TTL_tile_t tile_current_export);
1127
1128/**
1129 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1130 *
1131 * @param ext_tensor_in A tensor describing the input in global memory
1132 * @param int_base_in The address of the local import buffer.
1133 * @param ext_tensor_out A tensor describing the output in global memory
1134 * @param int_base_out The address of the local export buffer.
1135 * @param events A pointer to a list of 2 events.
1136 * The first event in the list will be used for imports, the second event in
1137 * the list will be used for exports.
1138 * @param first_tile The first tile to fetch for the scheme
1139 *
1140 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1141 *
1142 * The first event in the list will be used for imports,
1143 * the second event in the list will be used for exports.
1144 * \n\n Example:
1145 * @code
1146 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1147 *
1148 * TTL_duplex_const_uint_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1149 * ext_tensor_in, l_buffers[0],
1150 * ext_tensor_out, l_buffers[1],
1151 * &events, first_tile);
1152 * @endcode
1153 * \n
1154 *
1155 *
1156 *
1157 * Duplex buffering in more detail:
1158 *
1159 * The simplest form of duplex buffering takes the following flow.
1160 *
1161 * @startuml
1162 *
1163 * start
1164 *
1165 * :Create a TTL_tiler_t with TTL_create_tiler;
1166 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1167 * 1 input buffer, 1 output buffer;
1168 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1169 *
1170 * while (for each tile)
1171 *
1172 * :Import The Next Tile into the input buffer;
1173 *
1174 * :Process the Tile from the input buffer to the output buffer;
1175 *
1176 * :Export the Processed Tile from the output buffer;
1177 *
1178 * endwhile
1179 *
1180 * stop
1181 *
1182 * @enduml
1183 *
1184 * This can be optimized and standardized using the TTL_step_buffering
1185 * call.
1186 *
1187 * @startuml
1188 *
1189 * start
1190 *
1191 * :Create a TTL_tiler_t with TTL_create_tiler;
1192 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1193 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1194 *
1195 * while (for each tile)
1196 *
1197 * :Call TTL_step_buffering for the current tile
1198 *
1199 * This will import the current new tile and export the last tile
1200 * in parallel;
1201 *
1202 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1203 * :Process the Tile from the input buffer to the output buffer;
1204 * endif
1205 *
1206 * endwhile
1207 *
1208 * stop
1209 *
1210 * @enduml
1211 */
1212static inline TTL_duplex_const_uint_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1213 TTL_ext_uint_tensor_t ext_tensor_in, __local uint *int_base_in, TTL_ext_uint_tensor_t ext_tensor_out,
1214 __local uint *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1215 TTL_duplex_const_uint_tensor_buffering_t result;
1216 result.common.int_base[0] = int_base_in;
1217 result.common.int_base[1] = int_base_out;
1218
1219 result.common.ext_tensor_in = ext_tensor_in;
1220 result.common.ext_tensor_out = ext_tensor_out;
1221 result.events = events;
1224
1225 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1226
1227 return result;
1228}
1229
1230static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1231 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1232 TTL_tile_t tile_current_export) {
1233 const TTL_layout_t next_import_layout =
1234 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1235 const TTL_const_ext_uint_tensor_t next_import_ext_tensor =
1236 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
1237 tile_current_import.shape,
1238 duplex_buffering->common.ext_tensor_in.layout,
1239 tile_current_import.offset,
1240 duplex_buffering->common.ext_tensor_in.elem_size);
1241 const TTL_int_uint_sub_tensor_t next_import_int_sub_tensor =
1242 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1243 tile_current_import.shape,
1244 next_import_layout,
1245 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1246 tile_current_import.offset);
1247
1248 const TTL_const_int_uint_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1249 const TTL_ext_uint_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1250
1251 if (TTL_tile_empty(tile_current_import) == false)
1252 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1253 *TTL_to_void_tensor(&next_import_ext_tensor),
1254 &(*duplex_buffering->events)[0]);
1255
1256 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1257 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1258 *TTL_to_void_tensor(&next_export_ext_tensor),
1259 &(*duplex_buffering->events)[1]);
1260
1261 const TTL_layout_t int_export_layout =
1262 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1263 const TTL_ext_uint_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1264 tile_current_export.shape,
1265 duplex_buffering->common.ext_tensor_out.layout,
1266 tile_current_export.offset,
1267 duplex_buffering->common.ext_tensor_out.elem_size);
1268 const TTL_int_uint_sub_tensor_t to_export_from =
1269 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1270 tile_current_export.shape,
1271 int_export_layout,
1272 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1273 tile_current_export.offset);
1274
1275 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1276 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1277
1278 TTL_wait(2, *duplex_buffering->events);
1279
1280 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1281}
1282
1283static inline void __attribute__((overloadable)) TTL_finish_buffering(
1284 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering) {
1286}
1287/*
1288 * TTL_duplex_scheme.h
1289 *
1290 * Copyright (c) 2025 Mobileye
1291 *
1292 * Licensed under the Apache License, Version 2.0 (the License);
1293 * you may not use this file except in compliance with the License.
1294 * You may obtain a copy of the License at
1295 *
1296 * http://www.apache.org/licenses/LICENSE-2.0
1297 *
1298 * Unless required by applicable law or agreed to in writing, software
1299 * distributed under the License is distributed on an AS IS BASIS,
1300 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1301 * See the License for the specific language governing permissions and
1302 * limitations under the License.
1303 */
1304
1305// clang-format off
1306/**
1307 * @file
1308 *
1309 * Given a pair of blocking import and export operations that can execute concurrently,
1310 * TTL_duplex_buffering issues them together and then waits on both to complete,
1311 * hopefully executing them in parallel to each other. This scheme uses two
1312 * internal buffers, one for the import and one for the export. Note that the
1313 * export is pipelined to pair the import of the current tile with the export of
1314 * the previous tile.
1315 *
1316 * The following table shows the pipelined actions performed in duplex buffering.
1317 * It specifies which tile is processed in each iteration:
1318 *
1319 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1320 * |-------------------|-----|-----|----------------------|---------------|
1321 * | **Import** | 0 | 1 | i | |
1322 * | **Wait Import** | 0 | 1 | i | |
1323 * | **Compute** | 0 | 1 | i | |
1324 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1325 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1326 *
1327 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1328 *
1329 * When including this file the following must be defined
1330 *
1331 * #define TTL_TENSOR_TYPE void
1332 * #define TTL_TENSOR_TYPE uchar
1333 * etc
1334 *
1335 * @example TTL_duplex_buffering.cl
1336 */
1337// clang-format on
1338
1339// This file presumes that the following have been pre included.
1340// this is not done here for path reasons.
1341// #include "TTL_core.h"
1342// #include "TTL_import_export.h"
1343// #include TTL_IMPORT_EXPORT_INCLUDE_H
1344
1345/**
1346 * @def The structs used for this buffering type
1347 */
1348
1349/**
1350 * @brief Data required to perform duplex buffer pipelining.
1351 *
1352 * @see TTL_start_duplex_buffering for a description of duplex buffer
1353 * pipelining.
1354 */
1355typedef struct {
1356 struct {
1357 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1358 0->1->0->1... etc */
1359 __local short *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1360 TTL_ext_short_tensor_t ext_tensor_in; /*!< The external tensor being input */
1361 TTL_ext_short_tensor_t ext_tensor_out; /*!< The external tensor being output */
1362 } common; ///< The information that is common to all pipeline schemes
1363
1364 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
1365 ///< external to internal transfers, the second for
1366 ///< internal to external transfers
1367
1368 /**
1369 * @brief Store of the buffers used for the previous import/export cycles.
1370 *
1371 */
1372 struct {
1373 TTL_ext_short_tensor_t to_export_to; /*!< The external tensor the previous tile will be exported to */
1374 TTL_const_int_short_tensor_t to_export_from; /*!< The internal tensor the previous tile will be exported from */
1375 } prev_out_tensors;
1376} TTL_duplex_const_short_tensor_buffering_t;
1377
1378/*
1379 * Predeclare TTL_step_buffering.
1380 */
1381static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1382 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1383 TTL_tile_t tile_current_export);
1384
1385/**
1386 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1387 *
1388 * @param ext_tensor_in A tensor describing the input in global memory
1389 * @param int_base_in The address of the local import buffer.
1390 * @param ext_tensor_out A tensor describing the output in global memory
1391 * @param int_base_out The address of the local export buffer.
1392 * @param events A pointer to a list of 2 events.
1393 * The first event in the list will be used for imports, the second event in
1394 * the list will be used for exports.
1395 * @param first_tile The first tile to fetch for the scheme
1396 *
1397 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1398 *
1399 * The first event in the list will be used for imports,
1400 * the second event in the list will be used for exports.
1401 * \n\n Example:
1402 * @code
1403 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1404 *
1405 * TTL_duplex_const_short_tensor_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1406 * ext_tensor_in, l_buffers[0],
1407 * ext_tensor_out, l_buffers[1],
1408 * &events, first_tile);
1409 * @endcode
1410 * \n
1411 *
1412 *
1413 *
1414 * Duplex buffering in more detail:
1415 *
1416 * The simplest form of duplex buffering takes the following flow.
1417 *
1418 * @startuml
1419 *
1420 * start
1421 *
1422 * :Create a TTL_tiler_t with TTL_create_tiler;
1423 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1424 * 1 input buffer, 1 output buffer;
1425 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1426 *
1427 * while (for each tile)
1428 *
1429 * :Import The Next Tile into the input buffer;
1430 *
1431 * :Process the Tile from the input buffer to the output buffer;
1432 *
1433 * :Export the Processed Tile from the output buffer;
1434 *
1435 * endwhile
1436 *
1437 * stop
1438 *
1439 * @enduml
1440 *
1441 * This can be optimized and standardized using the TTL_step_buffering
1442 * call.
1443 *
1444 * @startuml
1445 *
1446 * start
1447 *
1448 * :Create a TTL_tiler_t with TTL_create_tiler;
1449 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1450 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1451 *
1452 * while (for each tile)
1453 *
1454 * :Call TTL_step_buffering for the current tile
1455 *
1456 * This will import the current new tile and export the last tile
1457 * in parallel;
1458 *
1459 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1460 * :Process the Tile from the input buffer to the output buffer;
1461 * endif
1462 *
1463 * endwhile
1464 *
1465 * stop
1466 *
1467 * @enduml
1468 */
1469static inline TTL_duplex_const_short_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1470 TTL_ext_short_tensor_t ext_tensor_in, __local short *int_base_in, TTL_ext_short_tensor_t ext_tensor_out,
1471 __local short *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1472 TTL_duplex_const_short_tensor_buffering_t result;
1473 result.common.int_base[0] = int_base_in;
1474 result.common.int_base[1] = int_base_out;
1475
1476 result.common.ext_tensor_in = ext_tensor_in;
1477 result.common.ext_tensor_out = ext_tensor_out;
1478 result.events = events;
1481
1482 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1483
1484 return result;
1485}
1486
1487static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1488 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1489 TTL_tile_t tile_current_export) {
1490 const TTL_layout_t next_import_layout =
1491 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1492 const TTL_const_ext_short_tensor_t next_import_ext_tensor =
1493 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
1494 tile_current_import.shape,
1495 duplex_buffering->common.ext_tensor_in.layout,
1496 tile_current_import.offset,
1497 duplex_buffering->common.ext_tensor_in.elem_size);
1498 const TTL_int_short_sub_tensor_t next_import_int_sub_tensor =
1499 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1500 tile_current_import.shape,
1501 next_import_layout,
1502 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1503 tile_current_import.offset);
1504
1505 const TTL_const_int_short_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1506 const TTL_ext_short_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1507
1508 if (TTL_tile_empty(tile_current_import) == false)
1509 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1510 *TTL_to_void_tensor(&next_import_ext_tensor),
1511 &(*duplex_buffering->events)[0]);
1512
1513 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1514 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1515 *TTL_to_void_tensor(&next_export_ext_tensor),
1516 &(*duplex_buffering->events)[1]);
1517
1518 const TTL_layout_t int_export_layout =
1519 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1520 const TTL_ext_short_tensor_t to_export_to =
1521 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1522 tile_current_export.shape,
1523 duplex_buffering->common.ext_tensor_out.layout,
1524 tile_current_export.offset,
1525 duplex_buffering->common.ext_tensor_out.elem_size);
1526 const TTL_int_short_sub_tensor_t to_export_from =
1527 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1528 tile_current_export.shape,
1529 int_export_layout,
1530 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1531 tile_current_export.offset);
1532
1533 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1534 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1535
1536 TTL_wait(2, *duplex_buffering->events);
1537
1538 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1539}
1540
1541static inline void __attribute__((overloadable)) TTL_finish_buffering(
1542 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering) {
1544}
1545/*
1546 * TTL_duplex_scheme.h
1547 *
1548 * Copyright (c) 2025 Mobileye
1549 *
1550 * Licensed under the Apache License, Version 2.0 (the License);
1551 * you may not use this file except in compliance with the License.
1552 * You may obtain a copy of the License at
1553 *
1554 * http://www.apache.org/licenses/LICENSE-2.0
1555 *
1556 * Unless required by applicable law or agreed to in writing, software
1557 * distributed under the License is distributed on an AS IS BASIS,
1558 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1559 * See the License for the specific language governing permissions and
1560 * limitations under the License.
1561 */
1562
1563// clang-format off
1564/**
1565 * @file
1566 *
1567 * Given a pair of blocking import and export operations that can execute concurrently,
1568 * TTL_duplex_buffering issues them together and then waits on both to complete,
1569 * hopefully executing them in parallel to each other. This scheme uses two
1570 * internal buffers, one for the import and one for the export. Note that the
1571 * export is pipelined to pair the import of the current tile with the export of
1572 * previous tile.
1573
1574 * The following table shows the pipelined actions performed in duplex buffering.
1575 * It specifies which tile is processed in each iteration:
1576 *
1577 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1578 * |-------------------|-----|-----|----------------------|---------------|
1579 * | **Import** | 0 | 1 | i | |
1580 * | **Wait Import** | 0 | 1 | i | |
1581 * | **Compute** | 0 | 1 | i | |
1582 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1583 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1584 *
1585 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1586 *
1587 * When including this file the following must be defined
1588 *
1589 * #define TTL_TENSOR_TYPE void
1590 * #define TTL_TENSOR_TYPE uchar
1591 * etc
1592 *
1593 * @example TTL_duplex_buffering.cl
1594 */
1595// clang-format on
1596
1597// This file presumes that the following have been pre included.
1598// this is not done here for path reasons.
1599// #include "TTL_core.h"
1600// #include "TTL_import_export.h"
1601// #include TTL_IMPORT_EXPORT_INCLUDE_H
1602
1603/**
1604 * @def The structs used for this buffering type
1605 */
1606
1607/**
1608 * @brief Data required to perform duplex buffer pipelining.
1609 *
1610 * @see TTL_start_duplex_buffering for a description of duplex buffer
1611 * pipelining.
1612 */
1613typedef struct {
1614 struct {
1615 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1616 0->1->0->1... etc */
1617 __local ushort *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1618 TTL_ext_ushort_tensor_t ext_tensor_in; /*!< The external tensor being input */
1619 TTL_ext_ushort_tensor_t ext_tensor_out; /*!< The external tensor being output */
1620 } common; ///< The information that is common to all pipeline schemes
1621
1622 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
1623 ///< external to internal transfers, the second for
1624 ///< internal to external transfers
1625
1626 /**
1627 * @brief Store of the buffers used for the previous import/export cycles.
1628 *
1629 */
1630 struct {
1631 TTL_ext_ushort_tensor_t to_export_to;
1632 TTL_const_int_ushort_tensor_t to_export_from;
1633 } prev_out_tensors;
1634 } TTL_duplex_const_ushort_tensor_buffering_t;
1635
1636/*
1637 * Predeclare TTL_step_buffering.
1638 */
1639static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1640 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1641 TTL_tile_t tile_current_export);
1642
1643/**
1644 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1645 *
1646 * @param ext_tensor_in A tensor describing the input in global memory
1647 * @param int_base_in The address of the local import buffer.
1648 * @param ext_tensor_out A tensor describing the output in global memory
1649 * @param int_base_out The address of the local export buffer.
1650 * @param events A pointer to a list of 2 events.
1651 * The first event in the list will be used for imports, the second event in
1652 * the list will be used for exports.
1653 * @param first_tile The first tile to fetch for the scheme
1654 *
1655 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1656 *
1657 * The first event in the list will be used for imports,
1658 * the second event in the list will be used for exports.
1659 * \n\n Example:
1660 * @code
1661 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1662 *
1663 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1664 * ext_tensor_in, l_buffers[0],
1665 * ext_tensor_out, l_buffers[1],
1666 * &events, first_tile);
1667 * @endcode
1668 * \n
1669 *
1670 *
1671 * Duplex buffering uses one internal buffer for imports and one for exports.
1672 * Each call to TTL_step_buffering imports the current tile and exports the
 * previously computed tile concurrently, then waits for both transfers to
 * complete before returning.
1673 *
1674 * The simplest form of duplex buffering takes the following flow.
1675 *
1676 * @startuml
1677 *
1678 * start
1679 *
1680 * :Create a TTL_tiler_t with TTL_create_tiler;
1681 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1682 * 1 input buffer, 1 output buffer;
1683 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1684 *
1685 * while (for each tile)
1686 *
1687 * :Import The Next Tile into the input buffer;
1688 *
1689 * :Process the Tile from the input buffer to the output buffer;
1690 *
1691 * :Export the Processed Tile from the output buffer;
1692 *
1693 * endwhile
1694 *
1695 * stop
1696 *
1697 * @enduml
1698 *
1699 * This can be optimized and standardized using the TTL_step_buffering
1700 * call.
1701 *
1702 * @startuml
1703 *
1704 * start
1705 *
1706 * :Create a TTL_tiler_t with TTL_create_tiler;
1707 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1708 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1709 *
1710 * while (for each tile)
1711 *
1712 * :Call TTL_step_buffering for the current tile
1713 *
1714 * This will import the current tile and export the previous tile
1715 * in parallel;
1716 *
1717 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1718 * :Process the Tile from the input buffer to the output buffer;
1719 * endif
1720 *
1721 * endwhile
1722 *
1723 * stop
1724 *
1725 * @enduml
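 *
 * A minimal sketch of the loop above. The names tiler, ext_tensor_in,
 * ext_tensor_out, l_in, l_out, TTL_get_tile() and compute() are illustrative
 * assumptions not defined in this file, and the returned TTL_io_ushort_tensor_t
 * is assumed to expose the imported_to/to_export_from pair given to
 * TTL_create_io_tensors:
 *
 * @code
 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
 *
 * TTL_duplex_const_ushort_tensor_buffering_t duplex_scheme = TTL_start_duplex_buffering(
 *     ext_tensor_in, l_in, ext_tensor_out, l_out, &events, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile = TTL_get_tile(i, tiler);
 *
 *     // Import tile i and export the result of tile i-1 in parallel, then wait on both.
 *     TTL_io_ushort_tensor_t io = TTL_step_buffering(&duplex_scheme, tile, tile);
 *
 *     if (!TTL_tile_empty(tile))
 *         compute(io.imported_to, io.to_export_from);  // input buffer -> output buffer
 * }
 *
 * // Assumed to perform the epilog export described in the pipelining table above.
 * TTL_finish_buffering(&duplex_scheme);
 * @endcode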
1726 */
1727 static inline TTL_duplex_const_ushort_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1728 TTL_ext_ushort_tensor_t ext_tensor_in, __local ushort *int_base_in, TTL_ext_ushort_tensor_t ext_tensor_out,
1729 __local ushort *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1730 TTL_duplex_const_ushort_tensor_buffering_t result;
1731 result.common.int_base[0] = int_base_in;
1732 result.common.int_base[1] = int_base_out;
1733
1734 result.common.ext_tensor_in = ext_tensor_in;
1735 result.common.ext_tensor_out = ext_tensor_out;
1736 result.events = events;
1739
1740 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1741
1742 return result;
1743}
1744
1745static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1746 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1747 TTL_tile_t tile_current_export) {
1748 const TTL_layout_t next_import_layout =
1749 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1750 const TTL_const_ext_ushort_tensor_t next_import_ext_tensor =
1751 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
1752 tile_current_import.shape,
1753 duplex_buffering->common.ext_tensor_in.layout,
1754 tile_current_import.offset,
1755 duplex_buffering->common.ext_tensor_in.elem_size);
1756 const TTL_int_ushort_sub_tensor_t next_import_int_sub_tensor =
1757 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1758 tile_current_import.shape,
1759 next_import_layout,
1760 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1761 tile_current_import.offset);
1762
1763 const TTL_const_int_ushort_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1764 const TTL_ext_ushort_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1765
1766 if (TTL_tile_empty(tile_current_import) == false)
1767 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1768 *TTL_to_void_tensor(&next_import_ext_tensor),
1769 &(*duplex_buffering->events)[0]);
1770
1771 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1772 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1773 *TTL_to_void_tensor(&next_export_ext_tensor),
1774 &(*duplex_buffering->events)[1]);
1775
1776 const TTL_layout_t int_export_layout =
1777 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1778 const TTL_ext_ushort_tensor_t to_export_to =
1779 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1780 tile_current_export.shape,
1781 duplex_buffering->common.ext_tensor_out.layout,
1782 tile_current_export.offset,
1783 duplex_buffering->common.ext_tensor_out.elem_size);
1784 const TTL_int_ushort_sub_tensor_t to_export_from =
1785 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1786 tile_current_export.shape,
1787 int_export_layout,
1788 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1789 tile_current_export.offset);
1790
1791 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1792 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1793
1794 TTL_wait(2, *duplex_buffering->events);
1795
1796 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1797}
1798
1799static inline void __attribute__((overloadable)) TTL_finish_buffering(
1800 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering) {
1802}
1803/*
1804 * TTL_duplex_scheme.h
1805 *
1806 * Copyright (c) 2025 Mobileye
1807 *
1808 * Licensed under the Apache License, Version 2.0 (the License);
1809 * you may not use this file except in compliance with the License.
1810 * You may obtain a copy of the License at
1811 *
1812 * http://www.apache.org/licenses/LICENSE-2.0
1813 *
1814 * Unless required by applicable law or agreed to in writing, software
1815 * distributed under the License is distributed on an AS IS BASIS,
1816 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1817 * See the License for the specific language governing permissions and
1818 * limitations under the License.
1819 */
1820
1821// clang-format off
1822/**
1823 * @file
1824 *
1825 * Given a pair of blocking import and export operations that can execute concurrently,
1826 * TTL_duplex_buffering issues them together and then waits on both to complete,
1827 * hopefully executing them in parallel to each other. This scheme uses two
1828 * internal buffers, one for the import and one for the export. Note that the
1829 * export is pipelined to pair the import of the current tile with the export of
1830 * previous tile.
1831
1832 * The following table shows the pipelined actions performed in duplex buffering.
1833 * It specifies which tile is processed in each iteration:
1834 *
1835 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1836 * |-------------------|-----|-----|----------------------|---------------|
1837 * | **Import** | 0 | 1 | i | |
1838 * | **Wait Import** | 0 | 1 | i | |
1839 * | **Compute** | 0 | 1 | i | |
1840 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1841 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1842 *
1843 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1844 *
1845 * When including this file the following must be defined
1846 *
1847 * #define TTL_TENSOR_TYPE void
1848 * #define TTL_TENSOR_TYPE uchar
1849 * etc
1850 *
1851 * @example TTL_duplex_buffering.cl
1852 */
1853// clang-format on
1854
1855// This file presumes that the following have been pre included.
1856// this is not done here for path reasons.
1857// #include "TTL_core.h"
1858// #include "TTL_import_export.h"
1859// #include TTL_IMPORT_EXPORT_INCLUDE_H
1860
1861/**
1862 * @def The structs used for this buffering type
1863 */
1864
1865/**
1866 * @brief Data required to perform duplex buffer pipelining.
1867 *
1868 * @see TTL_start_duplex_buffering for a description of duplex buffer
1869 * pipelining.
1870 */
1871typedef struct {
1872 struct {
1873 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1874 0->1->0->1... etc */
1875 __local long *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1876 TTL_ext_long_tensor_t ext_tensor_in; /*!< The external tensor being input */
1877 TTL_ext_long_tensor_t ext_tensor_out; /*!< The external tensor being output */
1878 } common; ///< The information that is common to all pipeline schemes
1879
1880 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
1881 ///< external to internal transfers, the second for
1882 ///< internal to external transfers
1883
1884 /**
1885 * @brief Store of the buffers used for the previous import/export cycles.
1886 *
1887 */
1888 struct {
1889 TTL_ext_long_tensor_t to_export_to;
1890 TTL_const_int_long_tensor_t to_export_from;
1891 } prev_out_tensors;
1892 } TTL_duplex_const_long_tensor_buffering_t;
1893
1894/*
1895 * Predeclare TTL_step_buffering.
1896 */
1897static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1898 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1899 TTL_tile_t tile_current_export);
1900
1901/**
1902 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1903 *
1904 * @param ext_tensor_in A tensor describing the input in global memory
1905 * @param int_base_in The address of the local import buffer.
1906 * @param ext_tensor_out A tensor describing the output in global memory
1907 * @param int_base_out The address of the local export buffer.
1908 * @param events A pointer to a list of 2 events.
1909 * The first event in the list will be used for imports, the second event in
1910 * the list will be used for exports.
1911 * @param first_tile The first tile to fetch for the scheme
1912 *
1913 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1914 *
1915 * The first event in the list will be used for imports,
1916 * the second event in the list will be used for exports.
1917 * \n\n Example:
1918 * @code
1919 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1920 *
1921 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1922 * ext_tensor_in, l_buffers[0],
1923 * ext_tensor_out, l_buffers[1],
1924 * &events, first_tile);
1925 * @endcode
1926 * \n
1927 *
1928 *
1929 * Duplex buffering uses one internal buffer for imports and one for exports.
1930 * Each call to TTL_step_buffering imports the current tile and exports the
 * previously computed tile concurrently, then waits for both transfers to
 * complete before returning.
1931 *
1932 * The simplest form of duplex buffering takes the following flow.
1933 *
1934 * @startuml
1935 *
1936 * start
1937 *
1938 * :Create a TTL_tiler_t with TTL_create_tiler;
1939 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1940 * 1 input buffer, 1 output buffer;
1941 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1942 *
1943 * while (for each tile)
1944 *
1945 * :Import The Next Tile into the input buffer;
1946 *
1947 * :Process the Tile from the input buffer to the output buffer;
1948 *
1949 * :Export the Processed Tile from the output buffer;
1950 *
1951 * endwhile
1952 *
1953 * stop
1954 *
1955 * @enduml
1956 *
1957 * This can be optimized and standardized using the TTL_step_buffering
1958 * call.
1959 *
1960 * @startuml
1961 *
1962 * start
1963 *
1964 * :Create a TTL_tiler_t with TTL_create_tiler;
1965 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1966 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1967 *
1968 * while (for each tile)
1969 *
1970 * :Call TTL_step_buffering for the current tile
1971 *
1972 * This will import the current tile and export the previous tile
1973 * in parallel;
1974 *
1975 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1976 * :Process the Tile from the input buffer to the output buffer;
1977 * endif
1978 *
1979 * endwhile
1980 *
1981 * stop
1982 *
1983 * @enduml
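 *
 * A minimal sketch of the loop above. The names tiler, ext_tensor_in,
 * ext_tensor_out, l_in, l_out, TTL_get_tile() and compute() are illustrative
 * assumptions not defined in this file, and the returned TTL_io_long_tensor_t
 * is assumed to expose the imported_to/to_export_from pair given to
 * TTL_create_io_tensors:
 *
 * @code
 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
 *
 * TTL_duplex_const_long_tensor_buffering_t duplex_scheme = TTL_start_duplex_buffering(
 *     ext_tensor_in, l_in, ext_tensor_out, l_out, &events, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile = TTL_get_tile(i, tiler);
 *
 *     // Import tile i and export the result of tile i-1 in parallel, then wait on both.
 *     TTL_io_long_tensor_t io = TTL_step_buffering(&duplex_scheme, tile, tile);
 *
 *     if (!TTL_tile_empty(tile))
 *         compute(io.imported_to, io.to_export_from);  // input buffer -> output buffer
 * }
 *
 * // Assumed to perform the epilog export described in the pipelining table above.
 * TTL_finish_buffering(&duplex_scheme);
 * @endcode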
1984 */
1985 static inline TTL_duplex_const_long_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1986 TTL_ext_long_tensor_t ext_tensor_in, __local long *int_base_in, TTL_ext_long_tensor_t ext_tensor_out,
1987 __local long *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1988 TTL_duplex_const_long_tensor_buffering_t result;
1989 result.common.int_base[0] = int_base_in;
1990 result.common.int_base[1] = int_base_out;
1991
1992 result.common.ext_tensor_in = ext_tensor_in;
1993 result.common.ext_tensor_out = ext_tensor_out;
1994 result.events = events;
1997
1998 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1999
2000 return result;
2001}
2002
2003static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2004 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2005 TTL_tile_t tile_current_export) {
2006 const TTL_layout_t next_import_layout =
2007 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2008 const TTL_const_ext_long_tensor_t next_import_ext_tensor =
2009 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2010 tile_current_import.shape,
2011 duplex_buffering->common.ext_tensor_in.layout,
2012 tile_current_import.offset,
2013 duplex_buffering->common.ext_tensor_in.elem_size);
2014 const TTL_int_long_sub_tensor_t next_import_int_sub_tensor =
2015 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2016 tile_current_import.shape,
2017 next_import_layout,
2018 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2019 tile_current_import.offset);
2020
2021 const TTL_const_int_long_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2022 const TTL_ext_long_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2023
2024 if (TTL_tile_empty(tile_current_import) == false)
2025 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2026 *TTL_to_void_tensor(&next_import_ext_tensor),
2027 &(*duplex_buffering->events)[0]);
2028
2029 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2030 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2031 *TTL_to_void_tensor(&next_export_ext_tensor),
2032 &(*duplex_buffering->events)[1]);
2033
2034 const TTL_layout_t int_export_layout =
2035 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2036 const TTL_ext_long_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2037 tile_current_export.shape,
2038 duplex_buffering->common.ext_tensor_out.layout,
2039 tile_current_export.offset,
2040 duplex_buffering->common.ext_tensor_out.elem_size);
2041 const TTL_int_long_sub_tensor_t to_export_from =
2042 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2043 tile_current_export.shape,
2044 int_export_layout,
2045 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2046 tile_current_export.offset);
2047
2048 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2049 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2050
2051 TTL_wait(2, *duplex_buffering->events);
2052
2053 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2054}
2055
2056static inline void __attribute__((overloadable)) TTL_finish_buffering(
2057 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering) {
2059}
2060/*
2061 * TTL_duplex_scheme.h
2062 *
2063 * Copyright (c) 2025 Mobileye
2064 *
2065 * Licensed under the Apache License, Version 2.0 (the License);
2066 * you may not use this file except in compliance with the License.
2067 * You may obtain a copy of the License at
2068 *
2069 * http://www.apache.org/licenses/LICENSE-2.0
2070 *
2071 * Unless required by applicable law or agreed to in writing, software
2072 * distributed under the License is distributed on an AS IS BASIS,
2073 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2074 * See the License for the specific language governing permissions and
2075 * limitations under the License.
2076 */
2077
2078// clang-format off
2079/**
2080 * @file
2081 *
2082 * Given a pair of blocking import and export operations that can execute concurrently,
2083 * TTL_duplex_buffering issues them together and then waits on both to complete,
2084 * hopefully executing them in parallel to each other. This scheme uses two
2085 * internal buffers, one for the import and one for the export. Note that the
2086 * export is pipelined to pair the import of the current tile with the export of
2087 * previous tile.
2088
2089 * The following table shows the pipelined actions performed in duplex buffering.
2090 * It specifies which tile is processed in each iteration:
2091 *
2092 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
2093 * |-------------------|-----|-----|----------------------|---------------|
2094 * | **Import** | 0 | 1 | i | |
2095 * | **Wait Import** | 0 | 1 | i | |
2096 * | **Compute** | 0 | 1 | i | |
2097 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
2098 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
2099 *
2100 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
2101 *
2102 * When including this file the following must be defined
2103 *
2104 * #define TTL_TENSOR_TYPE void
2105 * #define TTL_TENSOR_TYPE uchar
2106 * etc
2107 *
2108 * @example TTL_duplex_buffering.cl
2109 */
2110// clang-format on
2111
2112// This file presumes that the following have been pre included.
2113// this is not done here for path reasons.
2114// #include "TTL_core.h"
2115// #include "TTL_import_export.h"
2116// #include TTL_IMPORT_EXPORT_INCLUDE_H
2117
2118/**
2119 * @def The structs used for this buffering type
2120 */
2121
2122/**
2123 * @brief Data required to perform duplex buffer pipelining.
2124 *
2125 * @see TTL_start_duplex_buffering for a description of duplex buffer
2126 * pipelining.
2127 */
2128typedef struct {
2129 struct {
2130 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
2131 0->1->0->1... etc */
2132 __local ulong *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
2133 TTL_ext_ulong_tensor_t ext_tensor_in; /*!< The external tensor being input */
2134 TTL_ext_ulong_tensor_t ext_tensor_out; /*!< The external tensor being output */
2135 } common; ///< The information that is common to all pipeline schemes
2136
2137 TTL_event_t (*events)[2]; ///< Two events are required: the first is used for
2138 ///< external to internal transfers, the second for
2139 ///< internal to external transfers
2140
2141 /**
2142 * @brief Store of the buffers used for the previous import/export cycles.
2143 *
2144 */
2145 struct {
2146 TTL_ext_ulong_tensor_t to_export_to;
2147 TTL_const_int_ulong_tensor_t to_export_from;
2148 } prev_out_tensors;
2149 } TTL_duplex_const_ulong_tensor_buffering_t;
2150
2151/*
2152 * Predeclare TTL_step_buffering.
2153 */
2154static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2155 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
2156 TTL_tile_t tile_current_export);
2157
2158/**
2159 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
2160 *
2161 * @param ext_tensor_in A tensor describing the input in global memory
2162 * @param int_base_in The address of the local import buffer.
2163 * @param ext_tensor_out A tensor describing the output in global memory
2164 * @param int_base_out The address of the local export buffer.
2165 * @param events A pointer to a list of 2 events.
2166 * The first event in the list will be used for imports, the second event in
2167 * the list will be used for exports.
2168 * @param first_tile The first tile to fetch for the scheme
2169 *
2170 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
2171 *
2172 * The first event in the list will be used for imports,
2173 * the second event in the list will be used for exports.
2174 * \n\n Example:
2175 * @code
2176 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
2177 *
2178 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
2179 * ext_tensor_in, l_buffers[0],
2180 * ext_tensor_out, l_buffers[1],
2181 * &events, first_tile);
2182 * @endcode
2183 * \n
2184 *
2185 *
2186 * Duplex buffering uses one internal buffer for imports and one for exports.
2187 * Each call to TTL_step_buffering imports the current tile and exports the
 * previously computed tile concurrently, then waits for both transfers to
 * complete before returning.
2188 *
2189 * The simplest form of duplex buffering takes the following flow.
2190 *
2191 * @startuml
2192 *
2193 * start
2194 *
2195 * :Create a TTL_tiler_t with TTL_create_tiler;
2196 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
2197 * 1 input buffer, 1 output buffer;
2198 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2199 *
2200 * while (for each tile)
2201 *
2202 * :Import The Next Tile into the input buffer;
2203 *
2204 * :Process the Tile from the input buffer to the output buffer;
2205 *
2206 * :Export the Processed Tile from the output buffer;
2207 *
2208 * endwhile
2209 *
2210 * stop
2211 *
2212 * @enduml
2213 *
2214 * This can be optimized and standardized using the TTL_step_buffering
2215 * call.
2216 *
2217 * @startuml
2218 *
2219 * start
2220 *
2221 * :Create a TTL_tiler_t with TTL_create_tiler;
2222 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
2223 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2224 *
2225 * while (for each tile)
2226 *
2227 * :Call TTL_step_buffering for the current tile
2228 *
2229 * This will import the current tile and export the previous tile
2230 * in parallel;
2231 *
2232 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
2233 * :Process the Tile from the input buffer to the output buffer;
2234 * endif
2235 *
2236 * endwhile
2237 *
2238 * stop
2239 *
2240 * @enduml
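 *
 * A minimal sketch of the loop above. The names tiler, ext_tensor_in,
 * ext_tensor_out, l_in, l_out, TTL_get_tile() and compute() are illustrative
 * assumptions not defined in this file, and the returned TTL_io_ulong_tensor_t
 * is assumed to expose the imported_to/to_export_from pair given to
 * TTL_create_io_tensors:
 *
 * @code
 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
 *
 * TTL_duplex_const_ulong_tensor_buffering_t duplex_scheme = TTL_start_duplex_buffering(
 *     ext_tensor_in, l_in, ext_tensor_out, l_out, &events, TTL_get_tile(0, tiler));
 *
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile = TTL_get_tile(i, tiler);
 *
 *     // Import tile i and export the result of tile i-1 in parallel, then wait on both.
 *     TTL_io_ulong_tensor_t io = TTL_step_buffering(&duplex_scheme, tile, tile);
 *
 *     if (!TTL_tile_empty(tile))
 *         compute(io.imported_to, io.to_export_from);  // input buffer -> output buffer
 * }
 *
 * // Assumed to perform the epilog export described in the pipelining table above.
 * TTL_finish_buffering(&duplex_scheme);
 * @endcode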
2241 */
2242 static inline TTL_duplex_const_ulong_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
2243 TTL_ext_ulong_tensor_t ext_tensor_in, __local ulong *int_base_in, TTL_ext_ulong_tensor_t ext_tensor_out,
2244 __local ulong *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
2245 TTL_duplex_const_ulong_tensor_buffering_t result;
2246 result.common.int_base[0] = int_base_in;
2247 result.common.int_base[1] = int_base_out;
2248
2249 result.common.ext_tensor_in = ext_tensor_in;
2250 result.common.ext_tensor_out = ext_tensor_out;
2251 result.events = events;
2254
2255 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
2256
2257 return result;
2258}
2259
2260static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2261 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2262 TTL_tile_t tile_current_export) {
2263 const TTL_layout_t next_import_layout =
2264 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2265 const TTL_const_ext_ulong_tensor_t next_import_ext_tensor =
2266 TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2267 tile_current_import.shape,
2268 duplex_buffering->common.ext_tensor_in.layout,
2269 tile_current_import.offset,
2270 duplex_buffering->common.ext_tensor_in.elem_size);
2271 const TTL_int_ulong_sub_tensor_t next_import_int_sub_tensor =
2272 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2273 tile_current_import.shape,
2274 next_import_layout,
2275 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2276 tile_current_import.offset);
2277
2278 const TTL_const_int_ulong_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2279 const TTL_ext_ulong_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2280
2281 if (TTL_tile_empty(tile_current_import) == false)
2282 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2283 *TTL_to_void_tensor(&next_import_ext_tensor),
2284 &(*duplex_buffering->events)[0]);
2285
2286 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2287 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2288 *TTL_to_void_tensor(&next_export_ext_tensor),
2289 &(*duplex_buffering->events)[1]);
2290
2291 const TTL_layout_t int_export_layout =
2292 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2293 const TTL_ext_ulong_tensor_t to_export_to =
2294 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2295 tile_current_export.shape,
2296 duplex_buffering->common.ext_tensor_out.layout,
2297 tile_current_export.offset,
2298 duplex_buffering->common.ext_tensor_out.elem_size);
2299 const TTL_int_ulong_sub_tensor_t to_export_from =
2300 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2301 tile_current_export.shape,
2302 int_export_layout,
2303 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2304 tile_current_export.offset);
2305
2306 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2307 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2308
2309 TTL_wait(2, *duplex_buffering->events);
2310
2311 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2312}
2313
2314static inline void __attribute__((overloadable)) TTL_finish_buffering(
2315 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering) {
2317}
static int TTL_tile_empty(TTL_tile_t tile)
Check if the tile passed is empty.
static TTL_tile_t TTL_create_empty_tile()
Create an empty tile. Empty means it has all dimensions set to zero.
event_t TTL_event_t
TTL_event_t is a pseudonym for OpenCL event_t.
#define __global
The OpenCL __global address space qualifier is not supported in C.
#define __local
The OpenCL __local address space qualifier is not supported in C.
unsigned char uchar
OpenCL supports uchar so provide the same in c.
unsigned long ulong
OpenCL supports ulong so provide the same in c.
unsigned int uint
OpenCL supports uint so provide the same in c.
unsigned short ushort
OpenCL supports ushort so provide the same in c.
static void TTL_wait(const int num_events, TTL_event_t *const events)
static TTL_ext_void_tensor_t TTL_create_empty_ext_tensor(__global void *unused)
static TTL_ext_void_tensor_t TTL_create_ext_tensor(__global void *base, const TTL_shape_t shape, const TTL_layout_t layout, const TTL_offset_t offset, const TTL_dim_t elem_size)
const and non-const tensor creation functions.
static TTL_const_int_void_tensor_t TTL_create_empty_const_int_tensor(__local void *unused)
static const TTL_const_ext_void_tensor_t * TTL_to_const_tensor(const TTL_ext_void_tensor_t *const tensor)
static TTL_int_void_sub_tensor_t TTL_create_int_sub_tensor(__local void *base, const TTL_shape_t shape, const TTL_layout_t layout, const TTL_dim_t elem_size, const TTL_offset_t offset, const TTL_shape_t origin_shape, TTL_offset_t origin_offset)
const and non-const sub tensor creation functions.
static bool TTL_const_int_tensor_empty(TTL_const_int_void_tensor_t tensor)
static const TTL_ext_void_tensor_t * TTL_to_void_tensor(const TTL_ext_void_tensor_t *tensor)
static const TTL_ext_void_sub_tensor_t * TTL_to_void_sub_tensor(const TTL_ext_void_sub_tensor_t *tensor)
static TTL_const_ext_void_tensor_t TTL_create_const_ext_tensor(__global const void *base, const TTL_shape_t shape, const TTL_layout_t layout, const TTL_offset_t offset, const TTL_dim_t elem_size)
create TTL_create_int_tensor_impl
static TTL_layout_t TTL_create_layout(void)
Create a 1D Description of a Tensor layout in memory.
static void TTL_import_sub_tensor(const TTL_int_void_sub_tensor_t internal_sub_tensor, const TTL_const_ext_void_tensor_t const_external_tensor, TTL_event_t *event)
Implementation of TTL_import_sub_tensor.
static void TTL_export(const TTL_const_int_void_tensor_t internal_tensor, const TTL_ext_void_tensor_t external_tensor, TTL_event_t *event)
Export the internal tensor to the external tensor, returning when complete.
static TTL_io_void_tensor_t TTL_step_buffering(TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import, TTL_tile_t tile_current_export)
static void TTL_finish_buffering(TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering)
static TTL_duplex_const_void_tensor_buffering_t TTL_start_duplex_buffering(TTL_ext_void_tensor_t ext_tensor_in, __local void *int_base_in, TTL_ext_void_tensor_t ext_tensor_out, __local void *int_base_out, TTL_event_t(*events)[2], TTL_tile_t first_tile)
Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process.
static TTL_io_void_tensor_t TTL_create_io_tensors(TTL_int_void_sub_tensor_t imported_to, TTL_int_void_sub_tensor_t to_export_from)
Create a TTL_io_tensors_t from a pair of tensors.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_char_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
struct TTL_duplex_const_char_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_int_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
struct TTL_duplex_const_int_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_long_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_long_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_short_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_short_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_uchar_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_uchar_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_uint_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_uint_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_ulong_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
struct TTL_duplex_const_ulong_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_ushort_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_ushort_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
Data required to perform duplex buffer pipelining.
struct TTL_duplex_const_void_tensor_buffering_t::<anonymous> common
The information that is common to all pipeline schemes.
struct TTL_duplex_const_void_tensor_buffering_t::<anonymous> prev_out_tensors
Store of the buffers used for the previous import/export cycles.
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
const and non-const sub tensors in the appropriate address space
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Describes a pair of internal Tensors after an operation.
Description of a Tensor layout in memory.
TTL_dim_t width
Number of elements along dimension x.
TTL_dim_t height
Number of rows along dimension y.
TTL_offset_t offset
TTL_shape_t shape