分析opencl banckend 在了解banckend 特定后端实现前,有几个函数需要注意
1 2 3 4 5 6 7 #define ALIMIN(x, y) ((x) < (y) ? (x) : (y)) #define ALIMAX(x, y) ((x) > (y) ? (x) : (y)) #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y)) #define ALIGN_UP4(x) ROUND_UP((x), 4) #define ALIGN_UP8(x) ROUND_UP((x), 8)
ALIMIN(x, y) ((x) < (y) ? (x) : (y))
这个宏定义是一个简单的三元运算符,用来比较两个值 x 和 y 的大小,然后返回较小的那个值。具体来说,它的作用是:
检查 x 是否小于 y。
如果是,返回 x;如果不是,返回 y。
这个宏定义可以用于需要在两个值中选择较小值的情况,比如在编程中需要取两个数中的最小值时使用。
UP_DIV(x, y) (((x) + (y) - (1)) / (y))
如果x/y 刚好除尽UP_DIV(10,2)= 5
UP_DIV(4,2)= 2
如果x/y 除不尽 结果向上取整UP_DIV(10,3)= 4
ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y))
宏定义可以用于需要将数值向上取整到最接近的某个倍数的情况,比如内存对齐等 比如ROUND_UP(4, 8) = 8
ROUND_UP(9, 8) = 16
ROUND_UP(8, 8) = 8
opencl runtime 一个backend 都会由一个runtime。runtime也就是backend的核心成员,runtime 主要包装了Opencl的一些方法,以及Context,Events等底层变量。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 class OpenCLRuntime {public : OpenCLRuntime (const BackendConfig::PrecisionMode precision, const int cl_mode, int platformSize, int platformId, int deviceId); ~OpenCLRuntime (); OpenCLRuntime (const OpenCLRuntime &) = delete ; OpenCLRuntime &operator =(const OpenCLRuntime &) = delete ; bool isSupportedFP16 () const ; bool isWeightCpuTransHalf () const ; bool isDeviceSupportedFP16 () const ; bool isDeviceSupportedLowPower () const ; bool isSupportedDotInt8 () const ; bool isSupportedDotAccInt8 () const ; bool isSupportedIntelSubgroup () const ; ::cl::Context &context () ; ::cl::CommandQueue &commandQueue () ; ::cl::CommandQueue &recordableQueue () ; uint64_t deviceGlobalMemeryCacheSize () const ; uint32_t deviceComputeUnits () const ; uint32_t MaxThreadsPerDevice () const ; uint32_t MaxWorkGroupSize () const ; uint32_t maxFreq () const ; uint64_t getMaxWorkGroupSize (const ::cl::Kernel &kernel) ; uint64_t GetKernelWaveSize (const cl::Kernel &kernel) ; std::vector<uint32_t > getMaxWorkItemSizes () ; uint64_t getMaxLocalMem () const ; uint32_t getUseRecordableQueueSize () { return mUseRecordableQueueSize; } bool isSupportRecordQueue () { return mUseRecordQueue; } bool isDevideOpRecord () { return mDevideOpRecord; } GpuType getGpuType () { return mGpuType; } MaliAr getMaliAr () { return mMaliAr; } float getCLVersion () { return mCLVersion; } #ifdef MNN_OPENCL_SVM_ENABLE cl_device_svm_capabilities getSvmCapabilities () { return mSvmCapabilities; } #endif GpuMemObject getGpuMemType () { return mMemType; } CLTuneLevel getCLTuneLevel () { return mTuneLevel; } std::string getDeviceName () { return mDeviceName; } void pushEvent (std::pair<std::string, cl::Event> data) { return mEvents.push_back (data); } void printEventTime () ; void clearEvent () { mKernelTime = 0 ; mEvents.clear (); } uint64_t maxAllocSize () const ; void setCommandQueueProfileEnable () ; void setCommandQueueProfileDisable () ; unsigned int mQueueCount = 0 ; unsigned int getQueueNum () ; unsigned int mKernelTime = 0 ; std::map<std::pair<std::string, std::vector<uint32_t >>, std::pair<std::vector<uint32_t >, uint32_t >>& tunedLwsMap (); std::map<std::string, std::vector<std::pair<std::vector<uint32_t >, std::pair<std::vector<uint32_t >, uint32_t >>>>& getTuneLwsMap (); ::cl::Kernel buildKernel (const std::string &programName, const std::string &kernelName, const std::set<std::string> &buildOptions) ; ::cl::Kernel buildKernelFromSource (const std::string&, const std::string &kernelName, const std::set<std::string> &buildOptions) ; std::vector<size_t > getMaxImage2DSize () ; bool isCreateError () const { return mIsCreateError; } float flops () const { return mFlops; } double getCostTime (const cl::Event *event) ; double getQueuedTime (const cl::Event *event) ; double getSubmitTime (const cl::Event *event) ; std::pair<const void *, size_t > makeCache (void * tuneInfo) ; bool setCache (std::pair<const void *, size_t > cache) ; std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr; private : bool loadProgram (const std::string &programName, cl::Program *program) ; bool buildProgram (const std::string &buildOptionsStr, cl::Program *program) ; bool getDeviceSupportsExtension (const cl::Device &device, const char *extensionName) ; void setGpuMode (const int cl_mode_num) ; private : std::shared_ptr<::cl::Context> mContext; std::shared_ptr<::cl::Device> mFirstGPUDevicePtr; std::map<std::tuple<std::string, std::string>, ::cl::Program> mBuildProgramMap; std::shared_ptr<::cl::CommandQueue> mRecordableQueuePtr; uint64_t mGPUGlobalMemeryCacheSize; uint32_t mGPUComputeUnits; uint32_t mMaxFreq; uint32_t mMaxMemAllocSize; uint64_t mMaxLocalMemSize; uint32_t mMaxThreadsPerDevice; uint32_t mMaxWorkGroupSize; uint32_t mUseRecordableQueueSize; bool mUseRecordQueue = false ; bool mDevideOpRecord = true ; bool mIsSupportedFP16 = false ; bool mIsDeviceSupportedFP16 = false ; bool mIsDeviceSupportedLowPower = false ; bool mSupportDotInt8 = false ; bool mSupportDotAccInt8 = false ; bool mSupportedIntelSubgroup = false ; GpuType mGpuType; MaliAr mMaliAr; float mCLVersion = 1.0f ; std::vector<std::pair<std::string, cl::Event>> mEvents; #ifdef MNN_OPENCL_SVM_ENABLE cl_device_svm_capabilities mSvmCapabilities; #endif GpuMemObject mMemType = AUTO; CLTuneLevel mTuneLevel = Wide; std::string mDeviceName; bool isSetWorkGroupAttribute = false ; std::string mDefaultBuildParams; float mFlops = 4.0f ; bool mIsCreateError{false }; double mStartNanos; double mStopNanos; std::map<std::pair<std::string, std::vector<uint32_t >>, std::pair<std::vector<uint32_t >, uint32_t >> mTunedLws; std::map<std::string, std::vector<std::pair<std::vector<uint32_t >, std::pair<std::vector<uint32_t >, uint32_t >>>> mTuneLws; std::vector<uint8_t > mBuffer; const void * mCacheOutside = nullptr ; size_t mCacheOutsideSize = 0 ; }; } #endif
opencl CLRuntime CLRuntime 是第二次封装。OpenclRuntime是对底层的抽象,而CLRuntime可以理解为中间层,顶层OpenclBancked和底层OpenclRuntime。 让我们来看看CLRuntime的构造函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 CLRuntime::CLRuntime (const Backend::Info& info, int platformSize, int platformId, int deviceId){ mInfo = info; BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; BackendConfig::PowerMode power = BackendConfig::Power_Normal; BackendConfig::MemoryMode memory = BackendConfig::Memory_Normal; if (nullptr != mInfo.user) { precision = mInfo.user->precision; power = mInfo.user->power; memory = mInfo.user->memory; } mOpenCLRuntime.reset (new OpenCLRuntime (precision, mInfo.gpuMode, platformSize, platformId, deviceId)); mCLRuntimeError = mOpenCLRuntime->isCreateError (); mPrecision = precision; mMemory = memory; mTunedInfo = new TuneInfo; mImagePool.reset (new ImagePool (mOpenCLRuntime->context ())); mBufferPool.reset (new BufferPool (mOpenCLRuntime->context (), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR)); }
主要新增了mImagePool
Image池和Buffer
池,来管理内存。
Buffer Pool 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 class BufferPool : public NonCopyable {public : BufferPool (cl::Context& context, cl_mem_flags flags) : mContext (context) { mFlag = flags; } cl::Buffer* alloc (int size, bool separate = false ) ; void recycle (cl::Buffer* buffer, bool release = false ) ; void clear () ; void releaseFreeList () ; size_t totalSize () { return mTotalSize; } struct Node { int size; std::shared_ptr<cl::Buffer> buffer; }; private : std::map<cl::Buffer*, std::shared_ptr<Node>> mAllBuffer; std::multimap<int , std::shared_ptr<Node>> mFreeList; cl::Context& mContext; cl_mem_flags mFlag; size_t mTotalSize = 0 ; };
在创建CLRuntime的时候,mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
来创建bufferpool
.
bufferpool 申请内存 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 cl::Buffer* BufferPool::alloc (int size, bool separate) { if (!separate) { auto iter = mFreeList.lower_bound (size); if (iter != mFreeList.end ()) { auto buffer = iter->second->buffer.get (); mFreeList.erase (iter); return buffer; } } std::shared_ptr<Node> node (new Node) ; cl_int ret = CL_SUCCESS; mTotalSize += size; node->size = size; node->buffer.reset (new cl::Buffer (mContext, mFlag, size, NULL , &ret)); if (nullptr == node->buffer.get () || ret != CL_SUCCESS) { MNN_ERROR ("Alloc Buffer %d error, code:%d \n" , size, ret); return nullptr ; } mAllBuffer.insert (std::make_pair (node->buffer.get (), node)); return node->buffer.get (); }
其中node
的定义为
1 2 3 4 struct Node { int size; std::shared_ptr<cl::Buffer> buffer; };
bufferpool回收内存 1 2 3 4 5 6 7 8 9 10 11 12 13 class CLMemReleaseBuffer : public Backend::MemObj {public : CLMemReleaseBuffer (cl::Buffer* bId, BufferPool* bufferPool) { mBuffer = bId; mBufferPool = bufferPool; } virtual ~ CLMemReleaseBuffer () { mBufferPool->recycle (mBuffer); } private : cl::Buffer* mBuffer; BufferPool* mBufferPool; };
在向内存池申请完内存后,对外不是直接返回内存指针而是使用new CLMemReleaseBuffer(buffer, mStaticBufferPool.get())
来返回一个CLMemReleaseBuffer
对象,这个对象记录你的内存地址,和内存池。通过这个类来访问内存。当他析构时,将内存回收到内存池中。
需要提出的是,这个时候内存池虽然建好了,真正分配内存,是各个OP实例化时,调用算子类比如卷积Cov类时,onsize()才会在内存中池中,创建或者申请一段内存。 比如:
1 2 3 4 5 6 ConvBufWinograd::onResize (const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) { ... mOpenCLBackend->onAcquireBuffer (mSource.get (), Backend::DYNAMIC); ... }
OpenCLBackend::onAcquire 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 Backend::MemObj* OpenCLBackend::onAcquire (const Tensor* nativeTensor, StorageType storageType) { #ifdef LOG_VERBOSE MNN_PRINT ("Start OpenCLBackend::onAcquireBuffer !\n" ); #endif auto tensorShape = OpenCL::tensorShapeFormat (nativeTensor); int N = tensorShape.at (0 ); int H = tensorShape.at (1 ); int W = tensorShape.at (2 ); int C = tensorShape.at (3 ); #ifdef LOG_VERBOSE MNN_PRINT ("OpenCLBackend::onAcquireBuffer: NHWC:[%d, %d, %d, %d]\n" , N, H, W, C); #endif #ifndef MNN_OPENCL_BUFFER_CLOSED if (mOpenCLRuntime->getGpuMemType () == BUFFER) { size_t size; if (nativeTensor->dimensions () >= 2 ) { auto alignC = ROUND_UP (C, 8 ); auto hR = ROUND_UP (H + 3 , 4 ) - H; auto wR = ROUND_UP (W + 3 , 4 ) - W; size = N * alignC * W * H; size = size + hR * W * 4 + wR * 4 ; } else { size = nativeTensor->elementSize (); size = ROUND_UP (size, 4 ); } if (mOpenCLRuntime->isSupportedIntelSubgroup ()) { int cPack = TensorUtils::getTensorChannelPack (nativeTensor); auto pads = TensorUtils::getDescribe (nativeTensor)->mPads; size_t imageWidth = (size_t ) ROUND_UP (UP_DIV (C, cPack), 2 ) * ROUND_UP (pads.left + W + pads.right, 4 ); size_t imageHeight = (size_t )N * H; size = imageWidth*imageHeight*cPack; } cl_channel_type dataType = CL_FLOAT; if (getOpenCLRuntime ()->isSupportedFP16 ()) { dataType = CL_HALF_FLOAT; } if (storageType == DYNAMIC_SEPERATE) { auto buffer = mBufferPool->alloc (size* (dataType==CL_HALF_FLOAT?sizeof (half_float::half):sizeof (float )), true ); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )buffer; return new CLMemReleaseBuffer (buffer, mBufferPool.get ()); } if (storageType == DYNAMIC) { auto buffer = mBufferPool->alloc (size* (dataType==CL_HALF_FLOAT?sizeof (half_float::half):sizeof (float ))); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )buffer; return new CLMemReleaseBuffer (buffer, mBufferPool.get ()); } MNN_ASSERT (storageType == STATIC); #ifdef MNN_LOW_MEMORY if ((nativeTensor->getType ().code == halide_type_int) && (nativeTensor->getType ().bits == 8 || nativeTensor->getType ().bits == 4 )) { size_t alloc_size = size; if (nativeTensor->getType ().bits == 4 ) { alloc_size = size / 2 ; } auto buffer = mStaticBufferPool->alloc (alloc_size); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )buffer; return new CLMemReleaseBuffer (buffer, mStaticBufferPool.get ()); } #endif auto buffer = mStaticBufferPool->alloc (size* (dataType == CL_HALF_FLOAT ? sizeof (half_float::half) : sizeof (float ))); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )buffer; return new CLMemReleaseBuffer (buffer, mStaticBufferPool.get ()); } else #endif { size_t imageWidth = (size_t ) (UP_DIV (C, 4 ) * W); size_t imageHeight = (size_t )N * H; cl_channel_type dataType = CL_HALF_FLOAT; if (mPrecision == BackendConfig::Precision_High) { dataType = CL_FLOAT; } if (storageType == DYNAMIC_SEPERATE) { auto image = mImagePool->alloc (imageWidth, imageHeight, dataType, true ); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )image; return new CLMemReleaseImage (image, mImagePool.get ()); } if (storageType == DYNAMIC) { auto image = mImagePool->alloc (imageWidth, imageHeight, dataType); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )image; return new CLMemReleaseImage (image, mImagePool.get ()); } MNN_ASSERT (storageType == STATIC); auto image = mStaticImagePool->alloc (imageWidth, imageHeight, dataType); ((Tensor*)nativeTensor)->buffer ().device = (uint64_t )image; return new CLMemReleaseImage (image, mStaticImagePool.get ()); } }
OpenCLBackend::onAcquire
是openclbackend 对外暴露的接口,主要是做了对Tensor进行计算对其后的空间大小,调用CLMemReleaseImage
对一个native Tensor分配内存。