Tensor Tiling Library
 
Loading...
Searching...
No Matches
p/pipelines/TTL_simplex_scheme.h
Go to the documentation of this file.
1/*
2 * TTL_simplex_scheme.h
3 *
4 * Copyright (c) 2025 Mobileye
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19// clang-format off
20/**
21 * @file
22 *
23 * TTL_simplex_buffering pipelines a pair of import and export transactions using
24 * three internal buffers, in rotation: each buffer interchangeably serves as input
25 * buffer and output buffer, such that in each iteration one buffer is used both to
26 * export then import and two buffers are used by compute for reading and writing.
27 *
28 * With simplex buffering we're only waiting for previous iterations, so DMA
29 * transactions run mostly in parallel to computation, but serially with each
30 * other. Using the same buffer both for import and export is possible allowing us
31 * to overlap exporting from and importing to the same buffer.
32 *
33 * The following table draws the pipelined actions performed in simplex buffering.
34 * It specifies which tile is processed in each iteration:
35 *
36 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
37 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
38 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
39 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
40 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
41 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
42 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
43 *
44 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
45 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
46 *
47 * @example TTL_simplex_buffering.cl
48 */
49// clang-format on
50
51// This file presumes that the following have been pre included.
52// this is not done here for path reasons.
53// #include "TTL_core.h"
54// #include "TTL_import_export.h"
55// #include TTL_IMPORT_EXPORT_INCLUDE_H
57#include "TTL_schemes_common.h"
58
59// TTL_simplex_buffering_t
60template <typename TENSORTYPE>
62 /**
63 * @brief Create a TTL_simplex_buffering and begin the buffering process
64 *
65 * @param int_base1 The address of the first buffer to be used in local memory
66 * @param int_base2 The address of the second buffer to be used in local memory
67 * @param int_base3 The address of the third buffer to be used in local memory
68 * @param ext_tensor_in The external tensor to import the input data from
69 * @param ext_tensor_out The external tensor to export the output data to
70 * @param input_event_in A pointer to the event to use for the inward (external to
71 * internal) transfer completion
72 * @param input_event_out A pointer to the event to use for the outward (internal to
73 * external) transfer completion
74 * @param first_tile The first tile to fetch for the scheme
75 *
76 * Solid description of TTL_simplex_buffering buffering here
77 *
78 * @return The TTL_simplex_buffering created from the input parameters
79 *
80 * Example:
81 * @code
82 * TTL_event tb_e_in = TTL_get_event();
83 * TTL_event tb_e_out = TTL_get_event();
84 * TTL_simplex_buffering tb_scheme(
85 * ext_base_in, ext_base_out, l_buff1, l_buff2, l_buff3, ext_layout_in,
86 * ext_layout_out, &tb_e_in, &tb_e_out);
87 * @endcode
88 * \n
89 *
90 * This can be optimized and standardized using the TTL_step_buffering
91 * call.
92 *
93 * @startuml
94 *
95 * start
96 *
97 *
98 * stop
99 *
100 * @enduml
101 *
102 */
    TTL_simplex_buffering(TENSORTYPE *const int_base1, TENSORTYPE *const int_base2, TENSORTYPE *const int_base3,
                          const TTL_tensor<TENSORTYPE> &ext_tensor_in, const TTL_tensor<TENSORTYPE> &ext_tensor_out,
                          TTL_event *input_event_in, TTL_event *input_event_out, const TTL_tile first_tile) {
        // Record the three internal buffers that rotate through the
        // import / compute / export roles of the simplex scheme.
        m_common.int_base[0] = int_base1;
        m_common.int_base[1] = int_base2;
        m_common.int_base[2] = int_base3;
        m_common.ext_tensor_in = ext_tensor_in;
        m_common.ext_tensor_out = ext_tensor_out;
        m_event_in = input_event_in;
        m_event_out = input_event_out;

        // Start with the first buffer; step_buffering advances this index
        // modulo the number of buffers on every call.
        m_common.index = 0;

        // Prime the pipeline (the prolog iteration #-1 of the table in the
        // file header): begin importing the first tile; there is nothing to
        // export yet, expressed by passing an empty TTL_tile().
        step_buffering(first_tile, TTL_tile());
    }
121
122 TTL_io_tensors<TENSORTYPE> step_buffering(const TTL_tile &tile_next_import, const TTL_tile &tile_current_export) {
123 // For performance, compute everything possible before waiting for the previous operations to finish. The
124 // current index contains the tile that is to be exported, so prepare the structures before beginning the export
125 // and export.
126 const TTL_layout next_import_layout(tile_next_import.shape.width, tile_next_import.shape.height);
127 const TTL_sub_tensor<TENSORTYPE> next_import_int_sub_tensor(m_common.int_base[m_common.index],
128 tile_next_import.shape,
129 next_import_layout,
130 m_common.ext_tensor_in,
131 tile_next_import.offset);
132 const TTL_tensor<TENSORTYPE> next_import_ext_tensor(m_common.ext_tensor_in.base,
133 tile_next_import.shape,
134 m_common.ext_tensor_in.layout,
135 tile_next_import.offset,
136 m_common.ext_tensor_in.elem_size);
137
138 const TTL_layout int_export_layout(m_next_exported_tile.shape.width, m_next_exported_tile.shape.height);
139 const TTL_tensor<TENSORTYPE> int_export_tensor(m_common.int_base[m_common.index],
141 int_export_layout,
142 m_common.ext_tensor_out.elem_size);
143 const TTL_tensor<TENSORTYPE> export_to(m_common.ext_tensor_out.base,
145 m_common.ext_tensor_out.layout,
147 m_common.ext_tensor_out.elem_size);
148
149 // Wait for the previous (import/export)s to complete before starting the next.
152
153 if (m_next_exported_tile.empty() == false) TTL_export(int_export_tensor, export_to, m_event_out);
154
155 if (tile_next_import.empty() == false)
156 TTL_import_sub_tensor(next_import_int_sub_tensor, next_import_ext_tensor, m_event_in);
157
158 // The import/export has been started for the current tile, Move to the next
159 // tile.
160 m_common.index = (m_common.index + 1) % TTL_ARRAYSIZE(m_common.int_base); // Write to.
161
162 // Retrieve buffer imported previously to read from now.
163 const TTL_sub_tensor<TENSORTYPE> int_curr_buff_in = m_int_prev_imported;
164 m_int_prev_imported = next_import_int_sub_tensor;
165
166 // Can write to out buffer according to size of curr_tile, rather than size
167 // recently exported.
168 const TTL_layout curr_int_layout(tile_current_export.shape.width, tile_current_export.shape.width);
169 const TTL_sub_tensor<TENSORTYPE> int_curr_buff_out(m_common.int_base[m_common.index],
170 tile_current_export.shape,
171 curr_int_layout,
172 m_common.ext_tensor_in,
173 tile_current_export.offset);
174
175 // Save last two tiles to prevent m_common repeated get_tile()'s.
176 m_next_exported_tile = tile_current_export;
177
178 return TTL_io_tensors(int_curr_buff_in, int_curr_buff_out);
179 }
180
181 /**
182 * @brief Complete any transfers required to finish the buffering process.
183 *
184 * Any transfers that are still in progress will be completed and any transfers
185 * that need to be started and completed before finish_buffering returns
186 */
191
    TTL_common_buffering<TENSORTYPE, 3> m_common;  ///< The information that is m_common to all pipeline schemes

    // Cache previous gotten tiles.
    // NOTE(review): the class's documented member list also includes m_event_in,
    // m_event_out and m_next_exported_tile; their declaration lines appear lost
    // in this copy of the file — confirm against upstream TTL sources.
    TTL_sub_tensor<TENSORTYPE> m_int_prev_imported;  // Cache previously imported internal buffer.
};
static void TTL_wait(const int num_events, TTL_event_t *const events)
#define TTL_ARRAYSIZE(x)
Return the number of elements in the array x.
event_t TTL_event
TTL_event is a pseudonym for OpenCL event_t.
static void TTL_import_sub_tensor(const TTL_int_void_sub_tensor_t internal_sub_tensor, const TTL_const_ext_void_tensor_t const_external_tensor, TTL_event_t *event)
Implementation of TTL_import_sub_tensor.
static void TTL_export(const TTL_const_int_void_tensor_t internal_tensor, const TTL_ext_void_tensor_t external_tensor, TTL_event_t *event)
Export the external tensor to the internal tensor returning when complete.
Describes a pair of internal Tensors after an operation.
Description of a Tensor layout in memory.
TTL_dim height
Number of rows along dimension y.
TTL_dim width
Number of elements along dimension x.
TTL_sub_tensor< TENSORTYPE > m_int_prev_imported
TTL_io_tensors< TENSORTYPE > step_buffering(const TTL_tile &tile_next_import, const TTL_tile &tile_current_export)
void finish_buffering()
Complete any transfers required to finish the buffering process.
TTL_simplex_buffering(TENSORTYPE *const int_base1, TENSORTYPE *const int_base2, TENSORTYPE *const int_base3, const TTL_tensor< TENSORTYPE > &ext_tensor_in, const TTL_tensor< TENSORTYPE > &ext_tensor_out, TTL_event *input_event_in, TTL_event *input_event_out, const TTL_tile first_tile)
Create a TTL_simplex_buffering and begin the buffering process.
TTL_common_buffering< TENSORTYPE, 3 > m_common
The information that is m_common to all pipeline schemes.
A tensor plus its reference to its parent tensor.
A poor mans base class for an a tensor in the passed address space.
TTL_offset offset
TTL_shape shape
bool empty() const
Check if the tile passed is empty.