Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0.

This commit is contained in:
niansa 2023-06-22 12:58:07 +02:00 committed by Adam Treat
parent acfc5478ff
commit 4cdaa3c9cb
97 changed files with 13550 additions and 26 deletions

450
kompute/src/Algorithm.cpp Normal file
View file

@ -0,0 +1,450 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include <fstream>
#include "kompute/Algorithm.hpp"
namespace kp {
// Destructor: releases every Vulkan resource this algorithm owns by
// delegating to destroy(); destroy() is idempotent, so prior explicit
// cleanup is safe.
Algorithm::~Algorithm()
{
    KP_LOG_DEBUG("Kompute Algorithm Destructor started");
    this->destroy();
}
/**
 * Reports whether every Vulkan object required to dispatch this algorithm
 * has been created: pipeline, pipeline cache, pipeline layout, descriptor
 * pool/set/layout and the compute shader module.
 */
bool
Algorithm::isInit()
{
    const bool hasPipelineObjects = this->mPipeline && this->mPipelineCache &&
                                    this->mPipelineLayout;
    const bool hasDescriptorObjects = this->mDescriptorPool &&
                                      this->mDescriptorSet &&
                                      this->mDescriptorSetLayout;
    return hasPipelineObjects && hasDescriptorObjects &&
           static_cast<bool>(this->mShaderModule);
}
/**
 * Frees the Vulkan resources owned by this algorithm: pipeline, pipeline
 * cache, pipeline layout and shader module, then the descriptor set layout
 * via freeParameters(). Each object is destroyed only when the matching
 * mFree* ownership flag is set and the handle is non-null; destroyed
 * handles are reset to nullptr, making the call idempotent.
 */
void
Algorithm::destroy()
{
    // Push/specialization constant buffers are released by the command
    // buffer machinery, so they are intentionally not freed here.
    if (!this->mDevice) {
        KP_LOG_WARN("Kompute Algorithm destroy function reached with null "
                    "Device pointer");
        return;
    }
    // NOTE: the previous version re-checked each handle for null *inside*
    // the guarded branches and warned; those branches were unreachable
    // because the outer condition already guarantees a non-null handle.
    if (this->mFreePipeline && this->mPipeline) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline");
        this->mDevice->destroy(
          *this->mPipeline,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipeline = nullptr;
    }
    if (this->mFreePipelineCache && this->mPipelineCache) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache");
        this->mDevice->destroy(
          *this->mPipelineCache,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipelineCache = nullptr;
    }
    if (this->mFreePipelineLayout && this->mPipelineLayout) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout");
        this->mDevice->destroy(
          *this->mPipelineLayout,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mPipelineLayout = nullptr;
    }
    if (this->mFreeShaderModule && this->mShaderModule) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
        this->mDevice->destroy(
          *this->mShaderModule,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mShaderModule = nullptr;
    }
    freeParameters();
}
/**
 * Destroys the descriptor set layout when this instance owns it
 * (mFreeDescriptorSetLayout). The descriptor set itself is reclaimed with
 * its pool, so only the layout needs explicit destruction here.
 */
void
Algorithm::freeParameters()
{
    // Guard against a null device, mirroring destroy(); previously a null
    // mDevice here would have been dereferenced.
    if (!this->mDevice) {
        KP_LOG_WARN("Kompute Algorithm freeParameters reached with null "
                    "Device pointer");
        return;
    }
    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
        // The inner null re-check that used to live here was unreachable:
        // the enclosing condition already guarantees a non-null layout.
        this->mDevice->destroy(
          *this->mDescriptorSetLayout,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mDescriptorSetLayout = nullptr;
    }
}
void
Algorithm::createParameters()
{
KP_LOG_DEBUG("Kompute Algorithm createParameters started");
if (!*this->mDescriptorPool) {
KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
return;
}
std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
for (size_t i = 0; i < this->mTensors.size(); i++) {
descriptorSetBindings.push_back(
vk::DescriptorSetLayoutBinding(i, // Binding index
vk::DescriptorType::eStorageBuffer,
1, // Descriptor count
vk::ShaderStageFlagBits::eCompute));
}
// This is the component that is fed into the pipeline
vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
vk::DescriptorSetLayoutCreateFlags(),
static_cast<uint32_t>(descriptorSetBindings.size()),
descriptorSetBindings.data());
KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
vk::Result result = this->mDevice->createDescriptorSetLayout(
&descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSetLayout = true;
KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
}
vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
*this->mDescriptorPool,
1, // Descriptor set layout count
this->mDescriptorSetLayout.get());
KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
this->mDescriptorSet.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Successfully allocated descriptor sets.");
}
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
for (size_t i = 0; i < this->mTensors.size(); i++) {
std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
vk::DescriptorBufferInfo descriptorBufferInfo =
this->mTensors[i]->constructDescriptorBufferInfo();
computeWriteDescriptorSets.push_back(
vk::WriteDescriptorSet(*this->mDescriptorSet,
i, // Destination binding
0, // Destination array element
1, // Descriptor count
vk::DescriptorType::eStorageBuffer,
nullptr, // Descriptor image info
&descriptorBufferInfo));
this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
nullptr);
}
KP_LOG_DEBUG("Kompute Algorithm successfully run init");
}
void
Algorithm::updateParameters()
{
KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
if (!*this->mDescriptorPool) {
KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
return;
}
vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
*this->mDescriptorPool,
1, // Descriptor set layout count
this->mDescriptorSetLayout.get());
KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
this->mDescriptorSet.get());
if (result != vk::Result::eSuccess) {
KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
} else {
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Successfully allocated descriptor sets.");
}
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
for (size_t i = 0; i < this->mTensors.size(); i++) {
std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
vk::DescriptorBufferInfo descriptorBufferInfo =
this->mTensors[i]->constructDescriptorBufferInfo();
computeWriteDescriptorSets.push_back(
vk::WriteDescriptorSet(*this->mDescriptorSet,
i, // Destination binding
0, // Destination array element
1, // Descriptor count
vk::DescriptorType::eStorageBuffer,
nullptr, // Descriptor image info
&descriptorBufferInfo));
this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
nullptr);
}
KP_LOG_DEBUG("Kompute Algorithm successfully run init");
}
void
Algorithm::createShaderModule()
{
KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
sizeof(uint32_t) *
this->mSpirv.size(),
this->mSpirv.data());
KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
this->mSpirv.size());
this->mFreeShaderModule = true;
this->mShaderModule = std::make_shared<vk::ShaderModule>();
this->mDevice->createShaderModule(
&shaderModuleInfo, nullptr, this->mShaderModule.get());
this->mFreeShaderModule = true;
KP_LOG_DEBUG("Kompute Algorithm create shader module success");
}
/**
 * Builds the compute pipeline: pipeline layout (with an optional push
 * constant range), specialization constant mapping, shader stage, a
 * process-wide shared pipeline cache, and finally the vk::Pipeline itself.
 */
void
Algorithm::createPipeline()
{
    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");
    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
      vk::PipelineLayoutCreateFlags(),
      1, // Set layout count
      this->mDescriptorSetLayout.get());
    vk::PushConstantRange pushConstantRange;
    if (this->mPushConstantsSize) {
        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
        pushConstantRange.setOffset(0);
        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
                                  this->mPushConstantsSize);
        pipelineLayoutInfo.setPushConstantRangeCount(1);
        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
    }
    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
    this->mDevice->createPipelineLayout(
      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get());
    this->mFreePipelineLayout = true;
    // One map entry per specialization constant, laid out contiguously.
    std::vector<vk::SpecializationMapEntry> specializationEntries;
    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) {
        vk::SpecializationMapEntry specializationEntry(
          static_cast<uint32_t>(i),
          static_cast<uint32_t>(
            this->mSpecializationConstantsDataTypeMemorySize * i),
          this->mSpecializationConstantsDataTypeMemorySize);
        specializationEntries.push_back(specializationEntry);
    }
    // This passes ownership of the memory so we remove ownership from
    // specialization container by using "transferDataOwnership"
    vk::SpecializationInfo specializationInfo(
      static_cast<uint32_t>(specializationEntries.size()),
      specializationEntries.data(),
      this->mSpecializationConstantsDataTypeMemorySize *
        this->mSpecializationConstantsSize,
      this->mSpecializationConstantsData);
    vk::PipelineShaderStageCreateInfo shaderStage(
      vk::PipelineShaderStageCreateFlags(),
      vk::ShaderStageFlagBits::eCompute,
      *this->mShaderModule,
      "main",
      &specializationInfo);
    // A single pipeline cache shared by every Algorithm instance in the
    // process.
    static std::shared_ptr<vk::PipelineCache> globalPipelineCache =
      std::make_shared<vk::PipelineCache>();
    // Fix: always record the shared cache on this instance. Previously the
    // assignment lived inside the creation branch below, so every Algorithm
    // after the first kept a null mPipelineCache and isInit() reported
    // false even though the algorithm was fully usable.
    this->mPipelineCache = globalPipelineCache;
    if (!*globalPipelineCache) {
        vk::PipelineCacheCreateInfo pipelineCacheInfo =
          vk::PipelineCacheCreateInfo();
        // NOTE(review): the first creator is marked as owner and will
        // destroy the shared cache in destroy() while other instances may
        // still reference it — confirm the intended cache lifetime.
        this->mFreePipelineCache = true;
        this->mDevice->createPipelineCache(
          &pipelineCacheInfo, nullptr, globalPipelineCache.get());
    }
    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
                                               shaderStage,
                                               *this->mPipelineLayout,
                                               vk::Pipeline(),
                                               0);
#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
    // Newer vulkan-hpp returns a ResultValue wrapper.
    vk::ResultValue<vk::Pipeline> pipelineResult =
      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo);
    if (pipelineResult.result != vk::Result::eSuccess) {
        throw std::runtime_error("Failed to create pipeline result: " +
                                 vk::to_string(pipelineResult.result));
    }
    vk::Pipeline& pipeline = pipelineResult.value;
    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
    this->mFreePipeline = true;
#else
    vk::Pipeline pipeline =
      this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo)
        .value;
    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
    this->mFreePipeline = true;
#endif
    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
}
/**
 * Records the compute pipeline bind and descriptor set bind into the given
 * command buffer. Must run before recordBindPush()/recordDispatch().
 */
void
Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
                               *this->mPipeline);

    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
    const uint32_t firstSet = 0;
    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
                                     *this->mPipelineLayout,
                                     firstSet,
                                     *this->mDescriptorSet,
                                     nullptr // Dynamic offsets / dispatcher
    );
}
/**
 * Records the push-constant upload into the command buffer. A no-op when
 * the algorithm has no push constants configured.
 */
void
Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer)
{
    if (!this->mPushConstantsSize) {
        return;
    }
    const auto pushConstantBytes =
      this->mPushConstantsSize * this->mPushConstantsDataTypeMemorySize;
    KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}",
                 pushConstantBytes);
    commandBuffer.pushConstants(*this->mPipelineLayout,
                                vk::ShaderStageFlagBits::eCompute,
                                0,
                                pushConstantBytes,
                                this->mPushConstantsData);
}
// Records the compute dispatch using the workgroup counts previously set
// via setWorkgroup().
void
Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute Algorithm recording dispatch");
    commandBuffer.dispatch(
      this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]);
}
/**
 * Sets the dispatch size. When workgroup[0] is non-zero the explicit x/y/z
 * values are used (unset y/z default to 1); otherwise the dispatch falls
 * back to (minSize, 1, 1).
 */
void
Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize)
{
    KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size");

    if (workgroup[0] == 0) {
        // No explicit size provided: derive from minSize.
        this->mWorkgroup = { minSize, 1, 1 };
    } else {
        // Explicit x provided; default unset y/z dimensions to 1.
        const uint32_t sizeY = workgroup[1] > 0 ? workgroup[1] : 1;
        const uint32_t sizeZ = workgroup[2] > 0 ? workgroup[2] : 1;
        this->mWorkgroup = { workgroup[0], sizeY, sizeZ };
    }

    KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}",
                this->mWorkgroup[0],
                this->mWorkgroup[1],
                this->mWorkgroup[2]);
}
// Returns the currently configured dispatch size (x, y, z workgroup counts).
const Workgroup&
Algorithm::getWorkgroup()
{
    return this->mWorkgroup;
}
// Returns the tensors bound to this algorithm's descriptor set, in binding
// order.
const std::vector<std::shared_ptr<Tensor>>&
Algorithm::getTensors()
{
    return this->mTensors;
}
// Replaces the tensor list; callers are expected to refresh the descriptor
// bindings afterwards (see updateParameters()).
void Algorithm::setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    this->mTensors = tensors;
}
}

View file

@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20)

# Android builds link against the NDK's android library.
if(KOMPUTE_OPT_ANDROID_BUILD)
    find_library(android android)
endif()

# Fix: a second, redundant cmake_minimum_required(VERSION 3.20) call that
# followed the android block has been removed.

add_library(kompute Algorithm.cpp
                    Manager.cpp
                    OpAlgoDispatch.cpp
                    OpMemoryBarrier.cpp
                    OpTensorCopy.cpp
                    OpTensorSyncDevice.cpp
                    OpTensorSyncLocal.cpp
                    OpBufferSyncDevice.cpp
                    OpBufferSyncLocal.cpp
                    Sequence.cpp
                    Tensor.cpp
                    Core.cpp)

add_library(kompute::kompute ALIAS kompute)

# Set version for shared libraries.
set_target_properties(kompute
    PROPERTIES
        VERSION ${${PROJECT_NAME}_VERSION}
        SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR})

# Import GNU common install directory variables
include(GNUInstallDirs)

install(TARGETS kompute
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})

# Include CMake helpers for package config files
# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html
include(CMakePackageConfigHelpers)

configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in
                              "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake"
                              INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)

# NOTE(review): komputeConfigVersion.cmake is installed below but no
# write_basic_package_version_file() call generates it in this file —
# confirm it is produced elsewhere in the build.
install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake
              ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute)

# ####################################################
# Linking
# ####################################################
if(KOMPUTE_OPT_ANDROID_BUILD)
    target_link_libraries(kompute PUBLIC vulkanAndroid
                                         android
                                         kp_logger
                                         kp_shader
                                         fmt::fmt)
else()
    target_link_libraries(kompute PUBLIC Vulkan::Vulkan
                                         kp_logger
                                         kp_shader
                                         fmt::fmt)
endif()

if(KOMPUTE_OPT_BUILD_PYTHON)
    include_directories(${PYTHON_INCLUDE_DIRS})
    target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES})
endif()

if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
    target_link_libraries(kompute PUBLIC Vulkan-Headers)
endif()

# ####################################################
# Misc
# ####################################################
add_subdirectory(logger)
add_subdirectory(shaders)
add_subdirectory(include)

27
kompute/src/Core.cpp Normal file
View file

@ -0,0 +1,27 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Core.hpp"

// NOTE(review): `#if` treats an undefined macro as 0, so this block only
// activates when VK_USE_PLATFORM_ANDROID_KHR is defined to a non-zero value
// (as the Vulkan Android platform headers do) — confirm `#ifdef` was not
// the intent.
#if VK_USE_PLATFORM_ANDROID_KHR
#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
/**
 * Ensures support for dynamic loading of Vulkan functions on Android.
 * Acts as a default store for loaded functions.
 * More information:
 * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher
 **/
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#endif // VK_USE_PLATFORM_ANDROID_KHR

// Intentionally empty: this translation unit exists only to instantiate the
// dispatch loader storage above.
namespace kp {
} // namespace kp

493
kompute/src/Manager.cpp Normal file
View file

@ -0,0 +1,493 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Manager.hpp"
#include "fmt/format.h"
#include "kompute/logger/Logger.hpp"
#include <fmt/core.h>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
namespace kp {
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
// Vulkan debug-report callback: forwards validation layer messages to the
// Kompute debug log and never aborts the triggering call (returns VK_FALSE).
// When the active log level is above DEBUG the prefix/message parameters are
// left unnamed; KP_LOG_DEBUG presumably compiles to a no-op at that level so
// the unnamed identifiers are never referenced — verify against the logger
// macros if the log-level configuration changes.
static VKAPI_ATTR VkBool32 VKAPI_CALL
debugMessageCallback(VkDebugReportFlagsEXT /*flags*/,
                     VkDebugReportObjectTypeEXT /*objectType*/,
                     uint64_t /*object*/,
                     size_t /*location*/,
                     int32_t /*messageCode*/,
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
                     const char* pLayerPrefix,
                     const char* pMessage,
#else
                     const char* /*pLayerPrefix*/,
                     const char* /*pMessage*/,
#endif
                     void* /*pUserData*/)
{
    KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage);
    return VK_FALSE;
}
#endif
// Default constructor: enables automatic lifetime management of created
// sequences/algorithms/tensors, initializes logging when enabled, and
// creates the Vulkan instance. Device creation is deferred to
// initializeDevice().
Manager::Manager()
{
    this->mManageResources = true;

// Make sure the logger is setup
#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
    logger::setupLogger();
#endif
    this->createInstance();
}
// Creates the logical device on the selected physical device. Thin wrapper
// over createDevice(); note the argument order differs between the two
// signatures.
void Manager::initializeDevice(uint32_t physicalDeviceIndex,
                               const std::vector<uint32_t>& familyQueueIndices,
                               const std::vector<std::string>& desiredExtensions)
{
    this->createDevice(
      familyQueueIndices, physicalDeviceIndex, desiredExtensions);
}
// Destructor: tears down all managed resources, the device and the instance
// via destroy().
Manager::~Manager()
{
    KP_LOG_DEBUG("Kompute Manager Destructor started");
    this->destroy();
}
/**
 * Tears everything down in dependency order: managed sequences, then
 * algorithms, then tensors, then the logical device, the debug report
 * callback, and finally the Vulkan instance. Only resources this manager
 * owns (mFreeDevice / mFreeInstance) are destroyed. Returns early if the
 * device (and later the instance) pointer is already null.
 */
void
Manager::destroy()
{
    KP_LOG_DEBUG("Kompute Manager destroy() started");

    if (this->mDevice == nullptr) {
        KP_LOG_ERROR(
          "Kompute Manager destructor reached with null Device pointer");
        return;
    }

    // Sequences hold command buffers that must be released while the device
    // is still alive.
    if (this->mManageResources && this->mManagedSequences.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
                     "managed sequences");
        for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
            if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
                sq->destroy();
            }
        }
        this->mManagedSequences.clear();
    }

    if (this->mManageResources && this->mManagedAlgorithms.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
        for (const std::weak_ptr<Algorithm>& weakAlgorithm :
             this->mManagedAlgorithms) {
            if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
                algorithm->destroy();
            }
        }
        this->mManagedAlgorithms.clear();
    }

    if (this->mManageResources && this->mManagedTensors.size()) {
        KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
        for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
            if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
                tensor->destroy();
            }
        }
        this->mManagedTensors.clear();
    }

    if (this->mFreeDevice) {
        KP_LOG_INFO("Destroying device");
        this->mDevice->destroy(
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mDevice = nullptr;
        KP_LOG_DEBUG("Kompute Manager Destroyed Device");
    }

    if (this->mInstance == nullptr) {
        KP_LOG_ERROR(
          "Kompute Manager destructor reached with null Instance pointer");
        return;
    }

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    // The debug callback must be destroyed before the instance that created
    // it.
    if (this->mDebugReportCallback) {
        this->mInstance->destroyDebugReportCallbackEXT(
          this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
        KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
    }
#endif

    if (this->mFreeInstance) {
        this->mInstance->destroy(
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        this->mInstance = nullptr;
        KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
    }
}
/**
 * Creates the Vulkan instance. Unless KOMPUTE_DISABLE_VK_DEBUG_LAYERS is
 * defined, requests the debug-report extension, filters the desired (plus
 * KOMPUTE_ENV_DEBUG_LAYERS environment-provided) validation layers against
 * those available, and registers debugMessageCallback. On Android the
 * default vulkan-hpp dispatcher is primed before and after instance
 * creation.
 */
void
Manager::createInstance()
{
    KP_LOG_DEBUG("Kompute Manager creating instance");

    this->mFreeInstance = true;

    vk::ApplicationInfo applicationInfo;
    applicationInfo.pApplicationName = "Kompute";
    applicationInfo.pEngineName = "Kompute";
    applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION;
    applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION;
    applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION;

    std::vector<const char*> applicationExtensions;
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
#endif

    vk::InstanceCreateInfo computeInstanceCreateInfo;
    computeInstanceCreateInfo.pApplicationInfo = &applicationInfo;
    if (!applicationExtensions.empty()) {
        computeInstanceCreateInfo.enabledExtensionCount =
          (uint32_t)applicationExtensions.size();
        computeInstanceCreateInfo.ppEnabledExtensionNames =
          applicationExtensions.data();
    }

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    KP_LOG_DEBUG("Kompute Manager adding debug validation layers");
    // We'll identify the layers that are supported
    std::vector<const char*> validLayerNames;
    std::vector<const char*> desiredLayerNames = {
        "VK_LAYER_LUNARG_assistant_layer",
        "VK_LAYER_LUNARG_standard_validation",
        "VK_LAYER_KHRONOS_validation",
    };
    std::vector<std::string> envLayerNames;
    const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS");
    if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') {
        KP_LOG_DEBUG("Kompute Manager adding environment layers: {}",
                     envLayerNamesVal);
        // Whitespace-separated layer names; envLayerNames stays alive for
        // the rest of this function so the c_str() pointers remain valid.
        std::istringstream iss(envLayerNamesVal);
        std::istream_iterator<std::string> beg(iss);
        std::istream_iterator<std::string> end;
        envLayerNames = std::vector<std::string>(beg, end);
        for (const std::string& layerName : envLayerNames) {
            desiredLayerNames.push_back(layerName.c_str());
        }
        KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", "));
    }

    // Identify the valid layer names based on the desiredLayerNames
    {
        std::set<std::string> uniqueLayerNames;
        std::vector<vk::LayerProperties> availableLayerProperties =
          vk::enumerateInstanceLayerProperties();
        for (vk::LayerProperties layerProperties : availableLayerProperties) {
            std::string layerName(layerProperties.layerName.data());
            uniqueLayerNames.insert(layerName);
        }
        KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", "));
        for (const char* desiredLayerName : desiredLayerNames) {
            if (uniqueLayerNames.count(desiredLayerName) != 0) {
                validLayerNames.push_back(desiredLayerName);
            }
        }
    }

    if (!validLayerNames.empty()) {
        KP_LOG_DEBUG(
          "Kompute Manager Initializing instance with valid layers: {}",
          fmt::join(validLayerNames, ", "));
        computeInstanceCreateInfo.enabledLayerCount =
          static_cast<uint32_t>(validLayerNames.size());
        computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data();
    } else {
        KP_LOG_WARN("Kompute Manager no valid layer names found from desired "
                    "layer names");
    }
#endif

#if VK_USE_PLATFORM_ANDROID_KHR
    // Android exposes no global loader symbols; resolve
    // vkGetInstanceProcAddr dynamically and prime the default dispatcher.
    vk::DynamicLoader dl;
    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
#endif // VK_USE_PLATFORM_ANDROID_KHR

    // NOTE(review): the vk::Result of createInstance is not checked here; a
    // failure would leave *mInstance as a null handle — consider verifying.
    this->mInstance = std::make_shared<vk::Instance>();
    vk::createInstance(
      &computeInstanceCreateInfo, nullptr, this->mInstance.get());

#if VK_USE_PLATFORM_ANDROID_KHR
    VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance);
#endif // VK_USE_PLATFORM_ANDROID_KHR

    KP_LOG_DEBUG("Kompute Manager Instance Created");

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    KP_LOG_DEBUG("Kompute Manager adding debug callbacks");
    if (validLayerNames.size() > 0) {
        vk::DebugReportFlagsEXT debugFlags =
          vk::DebugReportFlagBitsEXT::eError |
          vk::DebugReportFlagBitsEXT::eWarning;
        vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {};
        debugCreateInfo.pfnCallback =
          (PFN_vkDebugReportCallbackEXT)debugMessageCallback;
        debugCreateInfo.flags = debugFlags;

        // On non-Android builds this name resolves to the loader's global
        // vkGetInstanceProcAddr symbol; on Android it is the local pointer
        // resolved above.
        this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr);
        this->mDebugReportCallback =
          this->mInstance->createDebugReportCallbackEXT(
            debugCreateInfo, nullptr, this->mDebugDispatcher);
    }
#endif
}
/**
 * Prunes expired (already destructed) weak references from the managed
 * tensor, algorithm and sequence lists. Does not destroy any live resource
 * itself; no-op when automatic resource management is disabled.
 */
void
Manager::clear()
{
    if (!this->mManageResources) {
        return;
    }
    // Erase-remove of expired weak_ptrs; generic over the three managed
    // container types.
    auto pruneExpired = [](auto& managed) {
        managed.erase(
          std::remove_if(begin(managed),
                         end(managed),
                         [](const auto& weak) { return weak.expired(); }),
          end(managed));
    };
    pruneExpired(this->mManagedTensors);
    pruneExpired(this->mManagedAlgorithms);
    pruneExpired(this->mManagedSequences);
}
/**
 * Creates the logical vk::Device: selects the physical device by index,
 * resolves compute-capable queue families (auto-detecting the first compute
 * queue when familyQueueIndices is empty), filters desiredExtensions
 * against what the device supports, enables 8/16-bit storage and float16 /
 * int8 shader features, creates the device and fetches one vk::Queue per
 * requested family index.
 *
 * @throws std::runtime_error when the instance is null, no Vulkan device
 *         exists, physicalDeviceIndex is out of range, or no compute queue
 *         is available.
 */
void
Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
                      uint32_t physicalDeviceIndex,
                      const std::vector<std::string>& desiredExtensions)
{

    KP_LOG_DEBUG("Kompute Manager creating Device");

    if (this->mInstance == nullptr) {
        throw std::runtime_error("Kompute Manager instance is null");
    }

    this->mFreeDevice = true;

    // Getting an integer that says how many vulkan devices we have
    std::vector<vk::PhysicalDevice> physicalDevices =
      this->mInstance->enumeratePhysicalDevices();
    uint32_t deviceCount = physicalDevices.size();

    // This means there are no devices at all
    if (deviceCount == 0) {
        throw std::runtime_error("Failed to find GPUs with Vulkan support! "
                                 "Maybe you haven't installed vulkan drivers?");
    }

    // This means that we're exceeding our device limit, for
    // example if we have 2 devices, just physicalDeviceIndex
    // 0 and 1 are acceptable. Hence, physicalDeviceIndex should
    // always be less than deviceCount, else we raise an error
    if (!(deviceCount > physicalDeviceIndex)) {
        throw std::runtime_error("There is no such physical index or device, "
                                 "please use your existing device");
    }

    vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex];

    this->mPhysicalDevice =
      std::make_shared<vk::PhysicalDevice>(physicalDevice);

#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
    vk::PhysicalDeviceProperties physicalDeviceProperties =
      physicalDevice.getProperties();
#endif

    // NOTE(review): physicalDeviceProperties is only declared under the
    // log-level guard above; presumably KP_LOG_INFO compiles to a no-op
    // above INFO so this reference disappears too — verify the macros.
    KP_LOG_INFO("Using physical device index {} found {}",
                physicalDeviceIndex,
                physicalDeviceProperties.deviceName);

    if (familyQueueIndices.empty()) {
        // Find compute queue
        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
          physicalDevice.getQueueFamilyProperties();

        uint32_t computeQueueFamilyIndex = 0;
        bool computeQueueSupported = false;
        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
            vk::QueueFamilyProperties queueFamilyProperties =
              allQueueFamilyProperties[i];

            if (queueFamilyProperties.queueFlags &
                vk::QueueFlagBits::eCompute) {
                computeQueueFamilyIndex = i;
                computeQueueSupported = true;
                break;
            }
        }

        if (!computeQueueSupported) {
            throw std::runtime_error("Compute queue is not supported");
        }

        this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
    } else {
        this->mComputeQueueFamilyIndices = familyQueueIndices;
    }

    // Count how many queues each family needs and give each queue priority
    // 1.0.
    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
    for (const auto& value : this->mComputeQueueFamilyIndices) {
        familyQueueCounts[value]++;
        familyQueuePriorities[value].push_back(1.0f);
    }

    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
    for (const auto& familyQueueInfo : familyQueueCounts) {
        // Setting the device count to 0
        familyQueueIndexCount[familyQueueInfo.first] = 0;

        // Creating the respective device queue
        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
          vk::DeviceQueueCreateFlags(),
          familyQueueInfo.first,
          familyQueueInfo.second,
          familyQueuePriorities[familyQueueInfo.first].data());
        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
    }

    KP_LOG_DEBUG("Kompute Manager desired extension layers {}",
                 fmt::join(desiredExtensions, ", "));

    // Keep only the desired extensions the device actually supports.
    std::vector<vk::ExtensionProperties> deviceExtensions =
      this->mPhysicalDevice->enumerateDeviceExtensionProperties();

    std::set<std::string> uniqueExtensionNames;
    for (const vk::ExtensionProperties& ext : deviceExtensions) {
        uniqueExtensionNames.insert(ext.extensionName);
    }
    KP_LOG_DEBUG("Kompute Manager available extensions {}",
                 fmt::join(uniqueExtensionNames, ", "));
    std::vector<const char*> validExtensions;
    for (const std::string& ext : desiredExtensions) {
        if (uniqueExtensionNames.count(ext) != 0) {
            validExtensions.push_back(ext.c_str());
        }
    }
    if (desiredExtensions.size() != validExtensions.size()) {
        KP_LOG_ERROR("Kompute Manager not all extensions were added: {}",
                     fmt::join(validExtensions, ", "));
    }

    // Enable 16-bit and 8-bit storage access plus float16/int8 shader
    // arithmetic (used by quantized model kernels), chained via pNext.
    vk::PhysicalDeviceFeatures features;
    features.shaderInt16 = true;

    vk::PhysicalDeviceVulkan11Features features11;
    features11.uniformAndStorageBuffer16BitAccess = true;
    features11.storageBuffer16BitAccess = true;
    features11.pNext = nullptr;

    vk::PhysicalDeviceVulkan12Features features12;
    features12.storageBuffer8BitAccess = true;
    features12.uniformAndStorageBuffer8BitAccess = true;
    features12.shaderFloat16 = true;
    features12.shaderInt8 = true;
    features12.pNext = &features11;

    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
                                          deviceQueueCreateInfos.size(),
                                          deviceQueueCreateInfos.data(),
                                          {},
                                          {},
                                          validExtensions.size(),
                                          validExtensions.data(),
                                          &features);
    deviceCreateInfo.pNext = &features12;

    this->mDevice = std::make_shared<vk::Device>();
    vk::Result r = physicalDevice.createDevice(
      &deviceCreateInfo, nullptr, this->mDevice.get());
    if (r != vk::Result::eSuccess) {
        KP_LOG_ERROR("Kompute Manager could not create device");
    }

    KP_LOG_DEBUG("Kompute Manager device created");

    // Fetch one vk::Queue per requested family index, walking through each
    // family's queue slots in order.
    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) {
        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();

        this->mDevice->getQueue(familyQueueIndex,
                                familyQueueIndexCount[familyQueueIndex],
                                currQueue.get());

        familyQueueIndexCount[familyQueueIndex]++;

        this->mComputeQueues.push_back(currQueue);
    }

    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
}
std::shared_ptr<Sequence>
Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
{
KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
std::shared_ptr<Sequence> sq{ new kp::Sequence(
this->mPhysicalDevice,
this->mDevice,
this->mComputeQueues[queueIndex],
this->mComputeQueueFamilyIndices[queueIndex],
totalTimestamps) };
if (this->mManageResources) {
this->mManagedSequences.push_back(sq);
}
return sq;
}
// Returns the properties (name, limits, etc.) of the selected physical
// device.
vk::PhysicalDeviceProperties
Manager::getDeviceProperties() const
{
    return this->mPhysicalDevice->getProperties();
}
// Enumerates all Vulkan physical devices visible through this manager's
// instance.
std::vector<vk::PhysicalDevice>
Manager::listDevices() const
{
    return this->mInstance->enumeratePhysicalDevices();
}
// Returns the shared Vulkan instance handle owned by this manager.
std::shared_ptr<vk::Instance>
Manager::getVkInstance() const
{
    return this->mInstance;
}
}

View file

@ -0,0 +1,65 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpAlgoDispatch.hpp"
namespace kp {
// Destructor: releases the push-constants buffer, which — given the free()
// call here — is presumably malloc-allocated by the constructor (defined
// elsewhere; confirm against the header).
OpAlgoDispatch::~OpAlgoDispatch()
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");

    if (this->mPushConstantsData) {
        KP_LOG_DEBUG("Kompute freeing push constants data");
        free(this->mPushConstantsData);
    }
}
/**
 * Records the full dispatch into the command buffer: a transfer-write →
 * shader-read barrier for every tensor used by the algorithm, optional push
 * constant update, then pipeline/descriptor binds and the dispatch itself.
 */
void
OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");

    // Barrier to ensure the data is finished writing to buffer memory
    for (const std::shared_ptr<Tensor>& tensor :
         this->mAlgorithm->getTensors()) {
        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eTransferWrite,
          vk::AccessFlagBits::eShaderRead,
          vk::PipelineStageFlagBits::eTransfer,
          vk::PipelineStageFlagBits::eComputeShader);
    }

    // Forward this op's push constants to the algorithm before binding.
    if (this->mPushConstantsSize) {
        this->mAlgorithm->setPushConstants(
          this->mPushConstantsData,
          this->mPushConstantsSize,
          this->mPushConstantsDataTypeMemorySize);
    }

    this->mAlgorithm->recordBindCore(commandBuffer);
    this->mAlgorithm->recordBindPush(commandBuffer);
    this->mAlgorithm->recordDispatch(commandBuffer);
}
// No-op hook: OpAlgoDispatch needs no host-side work before submission.
void
OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called");
}
// No-op hook: OpAlgoDispatch needs no host-side work after the fence wait.
void
OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    // Fixed log text: this hook is postEval, not postSubmit, matching the
    // wording used by the other operations.
    KP_LOG_DEBUG("Kompute OpAlgoDispatch postEval called");
}
}

View file

@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpBufferSyncDevice.hpp"
namespace kp {
// Syncs `size` bytes from the host-visible staging buffer into the
// device-local primary buffer when recorded. The op stores raw pointers
// only and never frees them (see destructor), so callers must keep both
// buffers alive for the op's lifetime.
OpBufferSyncDevice::OpBufferSyncDevice(
  vk::Buffer *primaryBuffer,
  vk::Buffer *stagingBuffer,
  vk::DeviceSize size)
  : mPrimaryBuffer(primaryBuffer)
  , mStagingBuffer(stagingBuffer)
  , mSize(size)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params");
}
// Nothing to release: the buffers are owned by the caller.
OpBufferSyncDevice::~OpBufferSyncDevice()
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started");
}
// Records the staging -> primary copy covering the full mSize bytes,
// starting at offset 0 in both buffers.
void
OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called");
    const vk::BufferCopy region{ 0, 0, mSize };
    commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, region);
}
// No-op hook: all work happens in record().
void
OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called");
}
// No-op hook: no host-side follow-up is needed after the copy executes.
void
OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called");
}
}

View file

@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpBufferSyncLocal.hpp"
namespace kp {
// Syncs `size` bytes from the device-local primary buffer back into the
// host-visible staging buffer when recorded. Stores raw, non-owning
// pointers; callers must keep both buffers alive for the op's lifetime.
OpBufferSyncLocal::OpBufferSyncLocal(
  vk::Buffer *primaryBuffer,
  vk::Buffer *stagingBuffer,
  vk::DeviceSize size)
  : mPrimaryBuffer(primaryBuffer)
  , mStagingBuffer(stagingBuffer)
  , mSize(size)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params");
}
// Nothing to release: the buffers are owned by the caller.
OpBufferSyncLocal::~OpBufferSyncLocal()
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started");
}
// Records the primary -> staging copy covering the full mSize bytes,
// starting at offset 0 in both buffers.
void
OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called");
    const vk::BufferCopy region{ 0, 0, mSize };
    commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, region);
}
// No-op hook: all work happens in record().
void
OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called");
}
// No-op hook: no host-side follow-up is needed after the copy executes.
void
OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called");
}
}

View file

@ -0,0 +1,74 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpMemoryBarrier.hpp"
namespace kp {
// Records a uniform buffer-memory barrier across a set of tensors.
// The access/stage masks apply identically to every tensor;
// barrierOnPrimary selects whether the primary or the staging buffer of
// each tensor is covered (see record()).
OpMemoryBarrier::OpMemoryBarrier(
  const std::vector<std::shared_ptr<Tensor>>& tensors,
  const vk::AccessFlagBits& srcAccessMask,
  const vk::AccessFlagBits& dstAccessMask,
  const vk::PipelineStageFlagBits& srcStageMask,
  const vk::PipelineStageFlagBits& dstStageMask,
  bool barrierOnPrimary)
  : mSrcAccessMask(srcAccessMask)
  , mDstAccessMask(dstAccessMask)
  , mSrcStageMask(srcStageMask)
  , mDstStageMask(dstStageMask)
  , mBarrierOnPrimary(barrierOnPrimary)
  , mTensors(tensors)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
}
// Nothing to release: tensors are shared pointers owned elsewhere too.
OpMemoryBarrier::~OpMemoryBarrier()
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
}
// Emits one buffer-memory barrier per tensor, targeting either the
// primary (device-local) buffer or the staging buffer as configured at
// construction time.
void
OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");

    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
        if (this->mBarrierOnPrimary) {
            tensor->recordPrimaryBufferMemoryBarrier(commandBuffer,
                                                     this->mSrcAccessMask,
                                                     this->mDstAccessMask,
                                                     this->mSrcStageMask,
                                                     this->mDstStageMask);
        } else {
            tensor->recordStagingBufferMemoryBarrier(commandBuffer,
                                                     this->mSrcAccessMask,
                                                     this->mDstAccessMask,
                                                     this->mSrcStageMask,
                                                     this->mDstStageMask);
        }
    }
}
// No-op hook: the barrier is fully described at record() time.
void
OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
}
// No-op hook: no host-side follow-up is needed after the barrier executes.
void
OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    // Fixed log text: this hook is postEval, not postSubmit, matching the
    // wording used by the other operations.
    KP_LOG_DEBUG("Kompute OpMemoryBarrier postEval called");
}
}

View file

@ -0,0 +1,90 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpTensorCopy.hpp"
#include "kompute/Tensor.hpp"
namespace kp {
// Copies tensor 0 into every other tensor of the list when evaluated.
// Validates up front that all tensors share the first tensor's data type
// and element count; throws std::runtime_error otherwise.
OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params");

    this->mTensors = tensors;

    if (this->mTensors.size() < 2) {
        throw std::runtime_error(
          "Kompute OpTensorCopy called with less than 2 tensor");
    }

    // Every tensor must agree with the first one on type and size, or the
    // recorded buffer copies would be ill-formed.
    const kp::Tensor::TensorDataTypes expectedType =
      this->mTensors[0]->dataType();
    const uint32_t expectedSize = this->mTensors[0]->size();

    for (const std::shared_ptr<Tensor>& candidate : this->mTensors) {
        if (candidate->dataType() != expectedType) {
            throw std::runtime_error(fmt::format(
              "Attempting to copy tensors of different types from {} to {}",
              Tensor::toString(expectedType),
              Tensor::toString(candidate->dataType())));
        }
        if (candidate->size() != expectedSize) {
            throw std::runtime_error(fmt::format(
              "Attempting to copy tensors of different sizes from {} to {}",
              expectedSize,
              candidate->size()));
        }
    }
}
// Nothing to release: tensors are shared pointers owned elsewhere too.
OpTensorCopy::~OpTensorCopy()
{
    KP_LOG_DEBUG("Kompute OpTensorCopy destructor started");
}
// Records a GPU-side copy from tensor 0 into every other tensor.
void
OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy record called");

    // Tensor 0 is the source; each remaining tensor copies from it.
    const std::shared_ptr<Tensor>& source = this->mTensors[0];
    for (size_t idx = 1; idx < this->mTensors.size(); ++idx) {
        this->mTensors[idx]->recordCopyFrom(commandBuffer, source);
    }
}
// No-op hook: all work happens in record() and postEval().
void
OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy preEval called");
}
// After the GPU copy has executed, mirror the source tensor's host-side
// raw data into every destination tensor so CPU views stay consistent.
// Storage tensors are skipped on both ends: they expose no host data.
void
OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");

    const std::shared_ptr<Tensor>& source = this->mTensors[0];

    // A storage source has nothing to mirror on the CPU side.
    if (source->tensorType() == kp::Tensor::TensorTypes::eStorage)
    {
        KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type");
        return;
    }

    void* sourceData = source->rawData();

    for (size_t idx = 1; idx < this->mTensors.size(); ++idx) {
        const std::shared_ptr<Tensor>& dest = this->mTensors[idx];
        if (dest->tensorType() == kp::Tensor::TensorTypes::eStorage) {
            KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type");
            continue;
        }
        dest->setRawData(sourceData);
    }
}
}

View file

@ -0,0 +1,61 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/operations/OpTensorSyncDevice.hpp"
namespace kp {
// Pushes host-side data to the GPU for every device tensor in the list
// (the actual staging->primary copies are recorded in record()).
// Requires at least one tensor.
OpTensorSyncDevice::OpTensorSyncDevice(
  const std::vector<std::shared_ptr<Tensor>>& tensors)
  : mPrimaryBuffer(nullptr)
  , mStagingBuffer(nullptr)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");

    // Idiomatic emptiness check (was `size() < 1`).
    if (tensors.empty()) {
        throw std::runtime_error(
          "Kompute OpTensorSyncDevice called with less than 1 tensor");
    }

    this->mTensors = tensors;
}
// Drops the shared tensor references; underlying tensors are owned elsewhere.
OpTensorSyncDevice::~OpTensorSyncDevice()
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");

    this->mTensors.clear();
}
// Records the staging -> device copy for every device tensor; tensors of
// other types are left untouched by the loop.
void
OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called");

    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
            tensor->recordCopyFromStagingToDevice(commandBuffer);
        }
    }
}
// No-op hook: all work happens in record().
void
OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
}
// No-op hook: device-bound syncs need no host-side follow-up.
void
OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
}
}

View file

@ -0,0 +1,76 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpTensorSyncLocal.hpp"
namespace kp {
// Pulls GPU data back to the host for every device tensor in the list
// (the actual device->staging copies are recorded in record()).
// Requires at least one tensor.
OpTensorSyncLocal::OpTensorSyncLocal(
  const std::vector<std::shared_ptr<Tensor>>& tensors)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");

    // Idiomatic emptiness check (was `size() < 1`).
    if (tensors.empty()) {
        throw std::runtime_error(
          "Kompute OpTensorSyncLocal called with less than 1 tensor");
    }

    this->mTensors = tensors;
}
// Nothing to release: tensors are shared pointers owned elsewhere too.
OpTensorSyncLocal::~OpTensorSyncLocal()
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
}
// For every device tensor: barrier (shader write -> transfer read), copy
// device -> staging, then barrier (transfer write -> host read) so the CPU
// can safely map the staged data after the fence signals.
void
OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called");

    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
        if (tensor->tensorType() != Tensor::TensorTypes::eDevice) {
            continue;
        }

        // Wait for any shader writes before the transfer reads the buffer.
        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eShaderWrite,
          vk::AccessFlagBits::eTransferRead,
          vk::PipelineStageFlagBits::eComputeShader,
          vk::PipelineStageFlagBits::eTransfer);

        tensor->recordCopyFromDeviceToStaging(commandBuffer);

        // Make the transfer result visible to host reads.
        // NOTE(review): this barrier targets the primary buffer although the
        // copy wrote the staging buffer — preserved from the original.
        tensor->recordPrimaryBufferMemoryBarrier(
          commandBuffer,
          vk::AccessFlagBits::eTransferWrite,
          vk::AccessFlagBits::eHostRead,
          vk::PipelineStageFlagBits::eTransfer,
          vk::PipelineStageFlagBits::eHost);
    }
}
// No-op hook: all work happens in record().
void
OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
}
// No host-side copy is performed here; staged data is expected to be
// visible through the tensor's mapped memory once the fence has signaled.
void
OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/)
{
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");

    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
}
}

396
kompute/src/Sequence.cpp Normal file
View file

@ -0,0 +1,396 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Sequence.hpp"
namespace kp {
// Wraps an existing device/queue pair in a recordable command sequence.
// Stores shared handles (the sequence does not create the device or queue),
// then eagerly creates the command pool and a primary command buffer.
// When totalTimestamps > 0, a timestamp query pool is also created.
Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                   std::shared_ptr<vk::Device> device,
                   std::shared_ptr<vk::Queue> computeQueue,
                   uint32_t queueIndex,
                   uint32_t totalTimestamps)
{
    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");

    this->mPhysicalDevice = physicalDevice;
    this->mDevice = device;
    this->mComputeQueue = computeQueue;
    this->mQueueIndex = queueIndex;

    // Pool must exist before the command buffer can be allocated from it.
    this->createCommandPool();
    this->createCommandBuffer();

    if (totalTimestamps > 0)
        // One extra slot for the baseline timestamp written in begin().
        this->createTimestampQueryPool(totalTimestamps +
                                       1); //+1 for the first one
}
// Tears down owned Vulkan resources via destroy(); skipped when the device
// handle was already released.
Sequence::~Sequence()
{
    KP_LOG_DEBUG("Kompute Sequence Destructor started");

    if (this->mDevice) {
        this->destroy();
    }
}
// Puts the command buffer into the recording state.
// No-op if already recording; throws if the previously submitted work has
// not been awaited yet. When timestamping is enabled, a baseline timestamp
// is written into query slot 0 before any operation is recorded.
void
Sequence::begin()
{
    KP_LOG_DEBUG("Kompute sequence called BEGIN");

    if (this->isRecording()) {
        KP_LOG_DEBUG("Kompute Sequence begin called when already recording");
        return;
    }

    if (this->isRunning()) {
        throw std::runtime_error(
          "Kompute Sequence begin called when sequence still running");
    }

    KP_LOG_INFO("Kompute Sequence command now started recording");
    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
    this->mRecording = true;

    // latch the first timestamp before any commands are submitted
    if (this->timestampQueryPool)
        this->mCommandBuffer->writeTimestamp(
          vk::PipelineStageFlagBits::eAllCommands,
          *this->timestampQueryPool,
          0);
}
// Finishes recording on the command buffer.
// Throws if previously submitted work has not been awaited; warns and
// returns if nothing is being recorded.
void
Sequence::end()
{
    KP_LOG_DEBUG("Kompute Sequence calling END");

    if (this->isRunning()) {
        // Fixed copy-paste error: the message previously said "begin called"
        // even though this guard lives in end().
        throw std::runtime_error(
          "Kompute Sequence end called when sequence still running");
    }

    if (!this->isRecording()) {
        KP_LOG_WARN("Kompute Sequence end called when not recording");
        return;
    } else {
        KP_LOG_INFO("Kompute Sequence command recording END");
        this->mCommandBuffer->end();
        this->mRecording = false;
    }
}
// Resets the sequence by closing any in-progress recording; nothing else
// is cleared here (operations are dropped when record() repopulates them).
void
Sequence::clear()
{
    KP_LOG_DEBUG("Kompute Sequence calling clear");
    if (!this->isRecording()) {
        return;
    }
    this->end();
}
// Synchronous evaluation: submit the recorded work and block until done.
std::shared_ptr<Sequence>
Sequence::eval()
{
    KP_LOG_DEBUG("Kompute sequence EVAL BEGIN");

    return this->evalAsync()->evalAwait();
}
// Convenience overload: clear the sequence, record a single op and run it
// synchronously.
std::shared_ptr<Sequence>
Sequence::eval(std::shared_ptr<OpBase> op)
{
    this->clear();
    return this->record(op)->eval();
}
// Submits the recorded command buffer to the compute queue without waiting.
// Ends any in-progress recording first; throws if a previous submission has
// not been awaited. Runs each operation's preEval hook before submitting and
// creates the fence that evalAwait() later waits on.
std::shared_ptr<Sequence>
Sequence::evalAsync()
{
    if (this->isRecording()) {
        this->end();
    }

    if (this->mIsRunning) {
        throw std::runtime_error(
          "Kompute Sequence evalAsync called when an eval async was "
          "called without successful wait");
    }

    this->mIsRunning = true;

    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->preEval(*this->mCommandBuffer);
    }

    vk::SubmitInfo submitInfo(
      0, nullptr, nullptr, 1, this->mCommandBuffer.get());

    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());

    KP_LOG_DEBUG(
      "Kompute sequence submitting command buffer into compute queue");

    // NOTE(review): the vk::Result returned by submit() is not checked —
    // a failed submission would go unnoticed here.
    this->mComputeQueue->submit(1, &submitInfo, this->mFence);

    return shared_from_this();
}
// Convenience overload: reset the sequence, record the single op and submit
// it without waiting for completion.
std::shared_ptr<Sequence>
Sequence::evalAsync(std::shared_ptr<OpBase> op)
{
    this->clear();
    return this->record(op)->evalAsync();
}
// Blocks on the submission fence for up to waitFor nanoseconds.
// Warns and returns immediately if nothing was submitted. The fence is
// destroyed and mIsRunning reset regardless of outcome; on timeout the
// postEval hooks are skipped.
std::shared_ptr<Sequence>
Sequence::evalAwait(uint64_t waitFor)
{
    if (!this->mIsRunning) {
        KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval");
        return shared_from_this();
    }

    vk::Result result =
      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
    this->mDevice->destroy(
      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);

    this->mIsRunning = false;

    if (result == vk::Result::eTimeout) {
        KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}",
                    waitFor);
        return shared_from_this();
    }

    // Let every operation finalize host-side state (e.g. copy staged data).
    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->postEval(*this->mCommandBuffer);
    }

    return shared_from_this();
}
// True while a submission is in flight (between evalAsync and evalAwait).
bool
Sequence::isRunning() const
{
    return this->mIsRunning;
}

// True while the command buffer is in the recording state.
bool
Sequence::isRecording() const
{
    return this->mRecording;
}

// True when all core Vulkan handles this sequence needs are present.
bool
Sequence::isInit() const
{
    return this->mDevice && this->mCommandPool && this->mCommandBuffer &&
           this->mComputeQueue;
}
void
Sequence::rerecord()
{
this->end();
std::vector<std::shared_ptr<OpBase>> ops = this->mOperations;
this->mOperations.clear();
for (const std::shared_ptr<kp::OpBase>& op : ops) {
this->record(op);
}
}
// Releases everything this sequence owns: command buffer, command pool,
// recorded operations, timestamp query pool, then the shared handles.
// NOTE(review): the early `return`s inside the command-buffer and
// command-pool branches skip the remaining cleanup steps — confirm this
// partial-teardown behavior is intended.
void
Sequence::destroy()
{
    KP_LOG_DEBUG("Kompute Sequence destroy called");

    if (!this->mDevice) {
        KP_LOG_WARN("Kompute Sequence destroy called "
                    "with null Device pointer");
        return;
    }

    if (this->mFreeCommandBuffer) {
        KP_LOG_INFO("Freeing CommandBuffer");
        if (!this->mCommandBuffer) {
            KP_LOG_WARN("Kompute Sequence destroy called with null "
                        "CommandPool pointer");
            return;
        }
        this->mDevice->freeCommandBuffers(
          *this->mCommandPool, 1, this->mCommandBuffer.get());

        this->mCommandBuffer = nullptr;
        this->mFreeCommandBuffer = false;

        KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer");
    }

    if (this->mFreeCommandPool) {
        KP_LOG_INFO("Destroying CommandPool");
        if (this->mCommandPool == nullptr) {
            KP_LOG_WARN("Kompute Sequence destroy called with null "
                        "CommandPool pointer");
            return;
        }
        this->mDevice->destroy(
          *this->mCommandPool,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);

        this->mCommandPool = nullptr;
        this->mFreeCommandPool = false;

        KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool");
    }

    if (this->mOperations.size()) {
        KP_LOG_INFO("Kompute Sequence clearing operations buffer");
        this->mOperations.clear();
    }

    if (this->timestampQueryPool) {
        KP_LOG_INFO("Destroying QueryPool");
        this->mDevice->destroy(
          *this->timestampQueryPool,
          (vk::Optional<const vk::AllocationCallbacks>)nullptr);

        this->timestampQueryPool = nullptr;

        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
    }

    // Drop the shared handles last; the Manager (or caller) owns them.
    if (this->mDevice) {
        this->mDevice = nullptr;
    }
    if (this->mPhysicalDevice) {
        this->mPhysicalDevice = nullptr;
    }
    if (this->mComputeQueue) {
        this->mComputeQueue = nullptr;
    }
}
// Records an operation into the command buffer and stores it for the
// pre/postEval hooks. begin() is a no-op when already recording. When
// timestamping is enabled, a timestamp is written after the op using the
// operation count as the query slot (slot 0 holds the baseline).
std::shared_ptr<Sequence>
Sequence::record(std::shared_ptr<OpBase> op)
{
    KP_LOG_DEBUG("Kompute Sequence record function started");

    this->begin();

    KP_LOG_DEBUG(
      "Kompute Sequence running record on OpBase derived class instance");

    op->record(*this->mCommandBuffer);

    this->mOperations.push_back(op);

    if (this->timestampQueryPool)
        this->mCommandBuffer->writeTimestamp(
          vk::PipelineStageFlagBits::eAllCommands,
          *this->timestampQueryPool,
          this->mOperations.size());

    return shared_from_this();
}
// Creates the command pool for this sequence's queue family and marks it
// for release in destroy().
// NOTE(review): the vk::Result of createCommandPool is not checked.
void
Sequence::createCommandPool()
{
    KP_LOG_DEBUG("Kompute Sequence creating command pool");

    if (!this->mDevice) {
        throw std::runtime_error("Kompute Sequence device is null");
    }

    this->mFreeCommandPool = true;

    vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(),
                                              this->mQueueIndex);
    this->mCommandPool = std::make_shared<vk::CommandPool>();
    this->mDevice->createCommandPool(
      &commandPoolInfo, nullptr, this->mCommandPool.get());
    KP_LOG_DEBUG("Kompute Sequence Command Pool Created");
}
// Allocates a single primary command buffer from the pool and marks it for
// release in destroy(). Requires createCommandPool() to have run first.
// NOTE(review): the vk::Result of allocateCommandBuffers is not checked.
void
Sequence::createCommandBuffer()
{
    KP_LOG_DEBUG("Kompute Sequence creating command buffer");
    if (!this->mDevice) {
        throw std::runtime_error("Kompute Sequence device is null");
    }
    if (!this->mCommandPool) {
        throw std::runtime_error("Kompute Sequence command pool is null");
    }

    this->mFreeCommandBuffer = true;

    vk::CommandBufferAllocateInfo commandBufferAllocateInfo(
      *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1);

    this->mCommandBuffer = std::make_shared<vk::CommandBuffer>();
    this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo,
                                          this->mCommandBuffer.get());
    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
}
// Creates a timestamp query pool holding totalTimestamps slots.
// Throws if the sequence is not fully initialized or if the device does not
// support timestamps on compute/graphics queues.
void
Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
{
    KP_LOG_DEBUG("Kompute Sequence creating query pool");
    if (!this->isInit()) {
        throw std::runtime_error(
          "createTimestampQueryPool() called on uninitialized Sequence");
    }
    if (!this->mPhysicalDevice) {
        throw std::runtime_error("Kompute Sequence physical device is null");
    }

    vk::PhysicalDeviceProperties physicalDeviceProperties =
      this->mPhysicalDevice->getProperties();

    // timestampComputeAndGraphics guarantees timestamp support on all
    // graphics and compute queues.
    if (physicalDeviceProperties.limits.timestampComputeAndGraphics) {
        vk::QueryPoolCreateInfo queryPoolInfo;
        queryPoolInfo.setQueryCount(totalTimestamps);
        queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
        this->timestampQueryPool = std::make_shared<vk::QueryPool>(
          this->mDevice->createQueryPool(queryPoolInfo));
        KP_LOG_DEBUG("Query pool for timestamps created");
    } else {
        throw std::runtime_error("Device does not support timestamps");
    }
}
// Reads back one timestamp per recorded operation plus the baseline
// (slot 0), waiting until results are available.
// NOTE(review): assumes the query pool was sized for at least
// mOperations.size() + 1 slots; recording more ops than the pool size
// passed at construction would read out of range.
std::vector<std::uint64_t>
Sequence::getTimestamps()
{
    if (!this->timestampQueryPool)
        throw std::runtime_error("Timestamp latching not enabled");

    const auto n = this->mOperations.size() + 1;
    std::vector<std::uint64_t> timestamps(n, 0);
    this->mDevice->getQueryPoolResults(
      *this->timestampQueryPool,
      0,
      n,
      timestamps.size() * sizeof(std::uint64_t),
      timestamps.data(),
      sizeof(uint64_t),
      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);

    return timestamps;
}
}

451
kompute/src/Tensor.cpp Normal file
View file

@ -0,0 +1,451 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#include "kompute/Tensor.hpp"
namespace kp {
// Human-readable name of a tensor element data type.
std::string
Tensor::toString(Tensor::TensorDataTypes dt)
{
    if (dt == TensorDataTypes::eBool)
        return "eBool";
    if (dt == TensorDataTypes::eInt)
        return "eInt";
    if (dt == TensorDataTypes::eUnsignedInt)
        return "eUnsignedInt";
    if (dt == TensorDataTypes::eFloat)
        return "eFloat";
    if (dt == TensorDataTypes::eDouble)
        return "eDouble";
    return "unknown";
}
// Human-readable name of a tensor residency type.
std::string
Tensor::toString(Tensor::TensorTypes dt)
{
    if (dt == TensorTypes::eDevice)
        return "eDevice";
    if (dt == TensorTypes::eHost)
        return "eHost";
    if (dt == TensorTypes::eStorage)
        return "eStorage";
    return "unknown";
}
// Constructs a tensor view over externally created buffers/memory.
// The tensor does not allocate Vulkan resources itself; it stores the
// provided non-owning pointers via rebuild()/setGPUResources().
Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
               std::shared_ptr<vk::Device> device,
               void* data,
               uint32_t elementTotalCount,
               uint32_t elementMemorySize,
               const TensorDataTypes& dataType,
               vk::DeviceMemory *primaryMemory,
               vk::Buffer *primaryBuffer,
               vk::DeviceMemory *stagingMemory,
               vk::Buffer *stagingBuffer,
               vk::DeviceSize offset,
               const TensorTypes& tensorType)
{
    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
                 elementTotalCount,
                 Tensor::toString(tensorType));

    this->mPhysicalDevice = physicalDevice;
    this->mDevice = device;
    this->mDataType = dataType;
    this->mTensorType = tensorType;

    this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
}
// Invalidates the tensor via destroy() unless the device handle was
// already released.
Tensor::~Tensor()
{
    KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}",
                 Tensor::toString(this->tensorType()));

    if (this->mDevice) {
        this->destroy();
    }

    KP_LOG_DEBUG("Kompute Tensor destructor success");
}
// Re-points the tensor at a new set of externally owned buffers/memory.
// Updates size bookkeeping, tears down any previously attached resources,
// then stores the new pointers. The `data` parameter is currently unused.
void
Tensor::rebuild(void* /*data*/,
                uint32_t elementTotalCount,
                uint64_t memorySize,
                vk::DeviceMemory *primaryMemory,
                vk::Buffer *primaryBuffer,
                vk::DeviceMemory *stagingMemory,
                vk::Buffer *stagingBuffer,
                vk::DeviceSize offset)
{
    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);

    this->mSize = elementTotalCount;
    this->mMemorySize = memorySize;
    this->mOffset = offset;

    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
        KP_LOG_DEBUG(
          "Kompute Tensor destroying existing resources before rebuild");
        this->destroy();
    }

    this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset);
}
// Residency type (device / host / storage) of this tensor.
Tensor::TensorTypes
Tensor::tensorType()
{
    return this->mTensorType;
}

// True when the tensor has a device, primary GPU resources and mapped
// host data.
bool
Tensor::isInit()
{
    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory &&
           this->mRawData;
}

// Number of elements.
uint32_t
Tensor::size()
{
    return this->mSize;
}

// Total size of the tensor's memory region in bytes.
uint64_t
Tensor::memorySize()
{
    return this->mMemorySize;
}

// Element data type tag.
kp::Tensor::TensorDataTypes
Tensor::dataType()
{
    return this->mDataType;
}

// Pointer to the host-visible (mapped) data.
void*
Tensor::rawData()
{
    return this->mRawData;
}

// Overwrites the host-visible data with memorySize() bytes from `data`.
// The caller must guarantee `data` holds at least that many bytes.
void
Tensor::setRawData(const void* data)
{
    memcpy(this->mRawData, data, this->memorySize());
}
// Records a device-side copy of this tensor's full memory region from the
// source tensor's primary buffer into our own primary buffer, applying
// mOffset on both sides.
void
Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                       std::shared_ptr<Tensor> copyFromTensor)
{
    const vk::DeviceSize bytes = this->memorySize();
    KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bytes);

    const vk::BufferCopy region{ mOffset, mOffset, bytes };
    this->recordCopyBuffer(commandBuffer,
                           copyFromTensor->mPrimaryBuffer,
                           this->mPrimaryBuffer,
                           bytes,
                           region);
}
// Records the staging -> primary copy for this tensor; silently skips
// tensors that have no staging buffer attached.
void
Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
{
    if (!this->mStagingBuffer)
        return;

    const vk::DeviceSize bytes = this->memorySize();
    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bytes);

    const vk::BufferCopy region{ mOffset, mOffset, bytes };
    this->recordCopyBuffer(commandBuffer,
                           this->mStagingBuffer,
                           this->mPrimaryBuffer,
                           bytes,
                           region);
}
// Records the primary -> staging copy for this tensor; silently skips
// tensors that have no staging buffer attached.
void
Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
{
    if (!this->mStagingBuffer)
        return;

    const vk::DeviceSize bytes = this->memorySize();
    KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bytes);

    const vk::BufferCopy region{ mOffset, mOffset, bytes };
    this->recordCopyBuffer(commandBuffer,
                           this->mPrimaryBuffer,
                           this->mStagingBuffer,
                           bytes,
                           region);
}
// Low-level helper: records a single vkCmdCopyBuffer with the given region.
// The bufferSize parameter is unused; the region carries the size.
void
Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
                         vk::Buffer *bufferFrom,
                         vk::Buffer *bufferTo,
                         vk::DeviceSize /*bufferSize*/,
                         vk::BufferCopy copyRegion)
{

    commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
}
// Records a buffer-memory barrier covering this tensor's primary buffer.
void
Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                         vk::AccessFlagBits srcAccessMask,
                                         vk::AccessFlagBits dstAccessMask,
                                         vk::PipelineStageFlagBits srcStageMask,
                                         vk::PipelineStageFlagBits dstStageMask)
{
    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");

    this->recordBufferMemoryBarrier(commandBuffer,
                                    *this->mPrimaryBuffer,
                                    srcAccessMask,
                                    dstAccessMask,
                                    srcStageMask,
                                    dstStageMask);
}
// Records a buffer-memory barrier covering this tensor's staging buffer;
// silently skips tensors that have no staging buffer attached.
void
Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                         vk::AccessFlagBits srcAccessMask,
                                         vk::AccessFlagBits dstAccessMask,
                                         vk::PipelineStageFlagBits srcStageMask,
                                         vk::PipelineStageFlagBits dstStageMask)
{
    if (!this->mStagingBuffer)
        return;

    KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier");

    this->recordBufferMemoryBarrier(commandBuffer,
                                    *this->mStagingBuffer,
                                    srcAccessMask,
                                    dstAccessMask,
                                    srcStageMask,
                                    dstStageMask);
}
// Records a pipeline barrier with a single buffer-memory barrier on the
// given buffer, sized to this tensor's memory region. Queue family
// ownership is not transferred (VK_QUEUE_FAMILY_IGNORED on both sides).
// NOTE(review): the barrier's offset field is left at its default of 0
// while the copy helpers use mOffset — confirm this is intended for
// tensors with a non-zero offset.
void
Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                  const vk::Buffer& buffer,
                                  vk::AccessFlagBits srcAccessMask,
                                  vk::AccessFlagBits dstAccessMask,
                                  vk::PipelineStageFlagBits srcStageMask,
                                  vk::PipelineStageFlagBits dstStageMask)
{
    KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier");

    vk::DeviceSize bufferSize = this->memorySize();

    vk::BufferMemoryBarrier bufferMemoryBarrier;
    bufferMemoryBarrier.buffer = buffer;
    bufferMemoryBarrier.size = bufferSize;
    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
    bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;

    commandBuffer.pipelineBarrier(srcStageMask,
                                  dstStageMask,
                                  vk::DependencyFlags(),
                                  nullptr,
                                  bufferMemoryBarrier,
                                  nullptr);
}
// Builds the descriptor-set buffer info for binding this tensor's primary
// buffer: [mOffset, mOffset + memorySize()).
vk::DescriptorBufferInfo
Tensor::constructDescriptorBufferInfo()
{
    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}",
                 this->memorySize());
    vk::DeviceSize bufferSize = this->memorySize();
    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
                                    mOffset, // offset
                                    bufferSize);
}
// Usage flags for the tensor's primary buffer, derived from its residency
// type. Throws for an unrecognized type.
vk::BufferUsageFlags
Tensor::getPrimaryBufferUsageFlags()
{
    switch (this->mTensorType) {
        // Device and host tensors share the same usage: storage plus both
        // transfer directions so data can be staged in and out. (Merged the
        // duplicated case bodies and removed the unreachable `break`s that
        // followed each `return`.)
        case TensorTypes::eDevice:
        case TensorTypes::eHost:
            return vk::BufferUsageFlagBits::eStorageBuffer |
                   vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
        // Storage tensors are GPU-only scratch space: no transfers.
        case TensorTypes::eStorage:
            return vk::BufferUsageFlagBits::eStorageBuffer;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Memory property flags for the tensor's primary allocation, derived from
// its residency type. Throws for an unrecognized type.
vk::MemoryPropertyFlags
Tensor::getPrimaryMemoryPropertyFlags()
{
    switch (this->mTensorType) {
        // Device and storage tensors both live in device-local memory.
        // (Merged the duplicated case bodies and removed the unreachable
        // `break`s that followed each `return`.)
        case TensorTypes::eDevice:
        case TensorTypes::eStorage:
            return vk::MemoryPropertyFlagBits::eDeviceLocal;
        // Host tensors are mapped directly, so they must be host visible
        // and coherent.
        case TensorTypes::eHost:
            return vk::MemoryPropertyFlagBits::eHostVisible |
                   vk::MemoryPropertyFlagBits::eHostCoherent;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Usage flags for the staging buffer; only device tensors have one, every
// other type throws. (Removed the unreachable `break` after `return`.)
vk::BufferUsageFlags
Tensor::getStagingBufferUsageFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
            return vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Memory property flags for the staging allocation; only device tensors
// have one, every other type throws. (Removed the unreachable `break`
// after `return`.)
vk::MemoryPropertyFlags
Tensor::getStagingMemoryPropertyFlags()
{
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
            return vk::MemoryPropertyFlagBits::eHostVisible |
                   vk::MemoryPropertyFlagBits::eHostCoherent;
        default:
            throw std::runtime_error("Kompute Tensor invalid tensor type");
    }
}
// Attaches externally created, non-owning buffer/memory pointers to the
// tensor. Only device tensors receive a staging pair; the offset parameter
// is currently unused. Throws if the device handles are missing.
void
Tensor::setGPUResources(vk::DeviceMemory *primaryMemory,
                        vk::Buffer *primaryBuffer,
                        vk::DeviceMemory *stagingMemory,
                        vk::Buffer *stagingBuffer,
                        vk::DeviceSize /*offset*/)
{
    KP_LOG_DEBUG("Kompute Tensor creating buffer");

    if (!this->mPhysicalDevice) {
        // Fixed typo in the error message ("phyisical" -> "physical").
        throw std::runtime_error("Kompute Tensor physical device is null");
    }

    if (!this->mDevice) {
        throw std::runtime_error("Kompute Tensor device is null");
    }

    KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory");

    this->mPrimaryBuffer = primaryBuffer;
    this->mPrimaryMemory = primaryMemory;

    if (this->mTensorType == TensorTypes::eDevice) {
        KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory");

        this->mStagingBuffer = stagingBuffer;
        this->mStagingMemory = stagingMemory;
    }

    KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful");
}
// Invalidates the tensor: clears the raw-data pointer and size bookkeeping
// unconditionally, then drops the device reference. The tensor never owns
// its buffers/memory, so nothing is freed here.
void
Tensor::destroy()
{
    KP_LOG_DEBUG("Kompute Tensor started destroy()");

    // Setting raw data to null regardless whether device is available to
    // invalidate Tensor
    this->mRawData = nullptr;
    this->mSize = 0;
    this->mMemorySize = 0;

    if (!this->mDevice) {
        KP_LOG_WARN(
          "Kompute Tensor destructor reached with null Device pointer");
        return;
    }

    // mDevice is known non-null past the early return above, so the
    // redundant second null check was removed.
    this->mDevice = nullptr;

    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
}
// Explicit specializations mapping each supported TensorT element type to
// its runtime TensorDataTypes tag.
template<>
Tensor::TensorDataTypes
TensorT<bool>::dataType()
{
    return Tensor::TensorDataTypes::eBool;
}

template<>
Tensor::TensorDataTypes
TensorT<int32_t>::dataType()
{
    return Tensor::TensorDataTypes::eInt;
}

template<>
Tensor::TensorDataTypes
TensorT<uint32_t>::dataType()
{
    return Tensor::TensorDataTypes::eUnsignedInt;
}

template<>
Tensor::TensorDataTypes
TensorT<float>::dataType()
{
    return Tensor::TensorDataTypes::eFloat;
}

template<>
Tensor::TensorDataTypes
TensorT<double>::dataType()
{
    return Tensor::TensorDataTypes::eDouble;
}
}

View file

@ -0,0 +1,46 @@
cmake_minimum_required(VERSION 3.20)

# ####################################################
# Kompute
# ####################################################
# Export the source dir as the public include path (and plain `include` once
# installed) so consumers can use `#include "kompute/..."`.
target_include_directories(kompute PUBLIC $<INSTALL_INTERFACE:include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

# Headers are listed as PRIVATE sources only so IDE project generators show
# them; they add nothing to the build itself.
target_sources(kompute PRIVATE
    # Header files (useful in IDEs)
    kompute/Algorithm.hpp
    kompute/Core.hpp
    kompute/Kompute.hpp
    kompute/Manager.hpp
    kompute/Sequence.hpp
    kompute/Tensor.hpp

    kompute/operations/OpAlgoDispatch.hpp
    kompute/operations/OpBase.hpp
    kompute/operations/OpMemoryBarrier.hpp
    kompute/operations/OpMult.hpp
    kompute/operations/OpTensorCopy.hpp
    kompute/operations/OpTensorSyncDevice.hpp
    kompute/operations/OpTensorSyncLocal.hpp
    kompute/operations/OpBufferSyncDevice.hpp
    kompute/operations/OpBufferSyncLocal.hpp

    kompute/logger/Logger.hpp
)

install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# ####################################################
# Logger
# ####################################################
target_include_directories(kp_logger PUBLIC $<INSTALL_INTERFACE:include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

target_sources(kp_logger PRIVATE
    # Header files (useful in IDEs)
    kompute/logger/Logger.hpp
)

# NOTE(review): this installs a top-level `logger` directory, but the logger
# header above lives under `kompute/logger` — confirm the path is correct.
install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View file

@ -0,0 +1,338 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "fmt/format.h"
#include "kompute/Tensor.hpp"
#include "logger/Logger.hpp"
namespace kp {
/**
Abstraction for compute shaders that are run on top of tensors grouped via
ParameterGroups (which group descriptorsets)
*/
class Algorithm
{
  public:
    /**
     * Main constructor for algorithm with configuration parameters to create
     * the underlying resources.
     *
     * @param device The Vulkan device to use for creating resources
     * @param pool The descriptor pool from which this algorithm's descriptor
     * sets are allocated (not owned by the algorithm)
     * @param tensors (optional) The tensors to use to create the descriptor
     * resources
     * @param spirv (optional) The spirv code to use to create the algorithm
     * @param workgroup (optional) The kp::Workgroup to use for the dispatch
     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
     * @param specializationConstants (optional) The templatable param is to be
     * used to initialize the specialization constants which cannot be changed
     * once set.
     * @param pushConstants (optional) This templatable param is to be used
     * when initializing the pipeline, which set the size of the push constants
     * - these can be modified but all new values must have the same data type
     * and length as otherwise it will result in errors.
     */
    template<typename S = float, typename P = float>
    Algorithm(std::shared_ptr<vk::Device> device,
              vk::DescriptorPool *pool,
              const std::vector<std::shared_ptr<Tensor>>& tensors = {},
              const std::vector<uint32_t>& spirv = {},
              const Workgroup& workgroup = {},
              const std::vector<S>& specializationConstants = {},
              const std::vector<P>& pushConstants = {})
    {
        KP_LOG_DEBUG("Kompute Algorithm Constructor with device");
        this->mDevice = device;
        this->mDescriptorPool = pool;
        // Only create Vulkan resources when both tensors and SPIR-V are
        // provided; otherwise the caller is expected to invoke rebuild() later.
        if (tensors.size() && spirv.size()) {
            KP_LOG_INFO(
              "Kompute Algorithm initialising with tensor size: {} and "
              "spirv size: {}",
              tensors.size(),
              spirv.size());
            this->rebuild(tensors,
                          spirv,
                          workgroup,
                          specializationConstants,
                          pushConstants);
        } else {
            KP_LOG_INFO(
              "Kompute Algorithm constructor with empty tensors and or "
              "spirv so not rebuilding vulkan components");
        }
    }

    /**
     * Rebuild function to reconstruct algorithm with configuration parameters
     * to create the underlying resources.
     *
     * @param tensors The tensors to use to create the descriptor resources
     * @param spirv The spirv code to use to create the algorithm
     * @param workgroup (optional) The kp::Workgroup to use for the dispatch
     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
     * @param specializationConstants (optional) The std::vector<S> to use
     * to initialize the specialization constants which cannot be changed once
     * set.
     * @param pushConstants (optional) The std::vector<P> to use when
     * initializing the pipeline, which set the size of the push constants -
     * these can be modified but all new values must have the same vector size
     * as this initial value.
     */
    template<typename S = float, typename P = float>
    void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
                 const std::vector<uint32_t>& spirv,
                 const Workgroup& workgroup = {},
                 const std::vector<S>& specializationConstants = {},
                 const std::vector<P>& pushConstants = {})
    {
        KP_LOG_DEBUG("Kompute Algorithm rebuild started");
        this->mTensors = tensors;
        this->mSpirv = spirv;
        // Copy the specialization constants into an owned, type-erased heap
        // buffer so the caller's vector may go out of scope.
        if (specializationConstants.size()) {
            if (this->mSpecializationConstantsData) {
                free(this->mSpecializationConstantsData);
            }
            // sizeof(decltype(...back())) strips the reference: this is
            // sizeof(S), the per-element byte size.
            uint32_t memorySize =
              sizeof(decltype(specializationConstants.back()));
            uint32_t size = specializationConstants.size();
            uint32_t totalSize = size * memorySize;
            this->mSpecializationConstantsData = malloc(totalSize);
            memcpy(this->mSpecializationConstantsData,
                   specializationConstants.data(),
                   totalSize);
            this->mSpecializationConstantsDataTypeMemorySize = memorySize;
            this->mSpecializationConstantsSize = size;
        }
        // Same type-erased copy for the push constants; later
        // setPushConstants() calls must match this total byte size.
        if (pushConstants.size()) {
            if (this->mPushConstantsData) {
                free(this->mPushConstantsData);
            }
            uint32_t memorySize = sizeof(decltype(pushConstants.back()));
            uint32_t size = pushConstants.size();
            uint32_t totalSize = size * memorySize;
            this->mPushConstantsData = malloc(totalSize);
            memcpy(this->mPushConstantsData, pushConstants.data(), totalSize);
            this->mPushConstantsDataTypeMemorySize = memorySize;
            this->mPushConstantsSize = size;
        }
        // Default workgroup falls back to the first tensor's element count
        // (or 1 when there are no tensors).
        this->setWorkgroup(
          workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
        // Descriptor pool is created first so if available then destroy all
        // before rebuild
        if (this->isInit()) {
            this->destroy();
        }
        this->createParameters();
        this->createShaderModule();
        this->createPipeline();
    }

    /**
     * Destructor for Algorithm which is responsible for freeing and destroying
     * respective pipelines and owned parameter groups.
     */
    ~Algorithm();

    /**
     * Records the dispatch function with the provided template parameters or
     * alternatively using the size of the tensor by default.
     *
     * @param commandBuffer Command buffer to record the algorithm resources to
     */
    void recordDispatch(const vk::CommandBuffer& commandBuffer);

    /**
     * Records command that binds the "core" algorithm components which consist
     * of binding the pipeline and binding the descriptorsets.
     *
     * @param commandBuffer Command buffer to record the algorithm resources to
     */
    void recordBindCore(const vk::CommandBuffer& commandBuffer);

    /**
     * Records command that binds the push constants to the command buffer
     * provided
     * - it is required that the pushConstants provided are of the same size as
     * the ones provided during initialization.
     *
     * @param commandBuffer Command buffer to record the algorithm resources to
     */
    void recordBindPush(const vk::CommandBuffer& commandBuffer);

    /**
     * function that checks all the gpu resource components to verify if these
     * have been created and returns true if all are valid.
     *
     * @returns returns true if the algorithm is currently initialized.
     */
    bool isInit();

    /**
     * Sets the work group to use in the recordDispatch
     *
     * @param workgroup The kp::Workgroup value to use to update the algorithm.
     * It must have a value greater than 1 on the x value (index 1) otherwise it
     * will be initialized on the size of the first tensor (ie.
     * this->mTensor[0]->size())
     * @param minSize Fallback x-dimension used when workgroup[0] is zero.
     */
    void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);

    /**
     * Sets the push constants to the new value provided to use in the next
     * bindPush()
     *
     * @param pushConstants The templatable vector is to be used to set the push
     * constants to use in the next bindPush(...) calls. The constants provided
     * must be of the same size as the ones created during initialization.
     */
    template<typename T>
    void setPushConstants(const std::vector<T>& pushConstants)
    {
        uint32_t memorySize = sizeof(decltype(pushConstants.back()));
        uint32_t size = pushConstants.size();
        this->setPushConstants(pushConstants.data(), size, memorySize);
    }

    /**
     * Re-points the algorithm at a (possibly re-created) descriptor pool and
     * re-allocates its descriptor sets from it.
     *
     * @param pool The new descriptor pool to allocate from (not owned).
     */
    void updateDescriptors(vk::DescriptorPool *pool)
    {
        this->mDescriptorPool = pool;
        this->setWorkgroup(
          this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1);
        this->updateParameters(); // TODO: See if we can reduce this
    }

    /**
     * Sets the push constants to the new value provided to use in the next
     * bindPush() with the raw memory block location and memory size to be used.
     *
     * @param data The raw data point to copy the data from, without modifying
     * the pointer.
     * @param size The number of data elements provided in the data
     * @param memorySize The memory size of each of the data elements in bytes.
     * @throws std::runtime_error if size * memorySize differs from the total
     * byte size established at initialization.
     */
    void setPushConstants(const void* data, uint32_t size, uint32_t memorySize)
    {
        uint32_t totalSize = memorySize * size;
        uint32_t previousTotalSize =
          this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize;
        // The pipeline layout was built with a fixed push-constant range, so
        // any replacement must match that byte size exactly.
        if (totalSize != previousTotalSize) {
            throw std::runtime_error(fmt::format(
              "Kompute Algorithm push "
              "constant total memory size provided is {} but expected {} bytes",
              totalSize,
              previousTotalSize));
        }
        if (this->mPushConstantsData) {
            free(this->mPushConstantsData);
        }
        this->mPushConstantsData = malloc(totalSize);
        memcpy(this->mPushConstantsData, data, totalSize);
        this->mPushConstantsDataTypeMemorySize = memorySize;
        this->mPushConstantsSize = size;
    }

    /**
     * Gets the current workgroup from the algorithm.
     *
     * @returns The kp::Workgroup currently set for the dispatch.
     */
    const Workgroup& getWorkgroup();

    /**
     * Gets the specialization constants of the current algorithm.
     *
     * @returns The std::vector<T> currently set for specialization
     * constants. T must match the element type used at initialization; the
     * stored bytes are reinterpreted, not converted.
     */
    template<typename T>
    const std::vector<T> getSpecializationConstants()
    {
        return { (T*)this->mSpecializationConstantsData,
                 ((T*)this->mSpecializationConstantsData) +
                   this->mSpecializationConstantsSize };
    }

    /**
     * Gets the push constants of the current algorithm.
     *
     * @returns The std::vector<T> currently set for push constants. T must
     * match the element type used at initialization; the stored bytes are
     * reinterpreted, not converted.
     */
    template<typename T>
    const std::vector<T> getPushConstants()
    {
        return { (T*)this->mPushConstantsData,
                 ((T*)this->mPushConstantsData) + this->mPushConstantsSize };
    }

    /**
     * Gets the current tensors that are used in the algorithm.
     *
     * @returns The list of tensors used in the algorithm.
     */
    const std::vector<std::shared_ptr<Tensor>>& getTensors();
    void setTensors(const std::vector<std::shared_ptr<Tensor>>& tensors);
    void destroy();

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::Device> mDevice;
    std::vector<std::shared_ptr<Tensor>> mTensors;

    // -------------- OPTIONALLY OWNED RESOURCES
    // Each Vulkan handle below is paired with a mFree* flag indicating
    // whether destroy() should release it or an external owner will.
    std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
    bool mFreeDescriptorSetLayout = false;
    vk::DescriptorPool *mDescriptorPool = nullptr;
    std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
    bool mFreeDescriptorSet = false;
    std::shared_ptr<vk::ShaderModule> mShaderModule;
    bool mFreeShaderModule = false;
    std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
    bool mFreePipelineLayout = false;
    std::shared_ptr<vk::PipelineCache> mPipelineCache;
    bool mFreePipelineCache = false;
    std::shared_ptr<vk::Pipeline> mPipeline;
    bool mFreePipeline = false;

    // -------------- ALWAYS OWNED RESOURCES
    std::vector<uint32_t> mSpirv;
    // Type-erased malloc'd copies of the spec/push constants plus the
    // per-element byte size and element count needed to reinterpret them.
    void* mSpecializationConstantsData = nullptr;
    uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
    uint32_t mSpecializationConstantsSize = 0;
    void* mPushConstantsData = nullptr;
    uint32_t mPushConstantsDataTypeMemorySize = 0;
    uint32_t mPushConstantsSize = 0;
    Workgroup mWorkgroup;

    // Create util functions
    void createShaderModule();
    void createPipeline();

    // Parameters
    void freeParameters();
    void createParameters();
    void updateParameters();
};
} // End namespace kp

View file

@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include <vulkan/vulkan.hpp>
// Typedefs to simplify interaction with core types
namespace kp {
// Compute-dispatch dimensions (x, y, z) passed to vkCmdDispatch.
using Workgroup = std::array<uint32_t, 3>;
// Convenience alias for a list of float constants (spec/push constants).
using Constants = std::vector<float>;
}
// Must be after vulkan is included
#ifndef KOMPUTE_VK_API_VERSION
#ifndef KOMPUTE_VK_API_MAJOR_VERSION
#define KOMPUTE_VK_API_MAJOR_VERSION 1
#endif // KOMPUTE_VK_API_MAJOR_VERSION
#ifndef KOMPUTE_VK_API_MINOR_VERSION
#define KOMPUTE_VK_API_MINOR_VERSION 2
#endif // KOMPUTE_VK_API_MINOR_VERSION
// Default Vulkan API version targeted by Kompute (patch fixed at 0).
#define KOMPUTE_VK_API_VERSION                                                 \
    VK_MAKE_VERSION(                                                           \
      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
#endif // KOMPUTE_VK_API_VERSION
#if defined(KOMPUTE_BUILD_PYTHON)
#include <pybind11/pybind11.h>
namespace py = pybind11;
// from python/src/main.cpp
extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
#endif

View file

@ -0,0 +1,21 @@
#pragma once
#include "Algorithm.hpp"
#include "Core.hpp"
#include "Manager.hpp"
#include "Sequence.hpp"
#include "Tensor.hpp"
#include "operations/OpAlgoDispatch.hpp"
#include "operations/OpBase.hpp"
#include "operations/OpMemoryBarrier.hpp"
#include "operations/OpMult.hpp"
#include "operations/OpTensorCopy.hpp"
#include "operations/OpTensorSyncDevice.hpp"
#include "operations/OpTensorSyncLocal.hpp"
#include "operations/OpBufferSyncDevice.hpp"
#include "operations/OpBufferSyncLocal.hpp"
// Will be built by CMake and placed inside the build directory
#include "ShaderLogisticRegression.hpp"
#include "ShaderOpMult.hpp"

View file

@ -0,0 +1,267 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include <set>
#include <unordered_map>
#include "kompute/Core.hpp"
#include "kompute/Sequence.hpp"
#include "logger/Logger.hpp"
#define KP_DEFAULT_SESSION "DEFAULT"
namespace kp {
/**
Base orchestrator which creates and manages device and child components
*/
class Manager
{
  public:
    /**
    Base constructor.
    */
    Manager();

    /**
     * Manager destructor which would ensure all owned resources are destroyed
     * unless explicitly stated that resources should not be destroyed or freed.
     */
    ~Manager();

    // True once a vk::Device has been created via initializeDevice().
    bool hasDevice() const {
        return this->mDevice.get();
    }

    /**
     * Initialize a device.
     *
     * @param physicalDeviceIndex The index of the physical device to use
     * @param familyQueueIndices (Optional) List of queue indices to add for
     * explicit allocation
     * @param desiredExtensions The desired extensions to load from
     * physicalDevice
     */
    void initializeDevice(uint32_t physicalDeviceIndex,
                          const std::vector<uint32_t>& familyQueueIndices = {},
                          const std::vector<std::string>& desiredExtensions = {});

    /**
     * Create a managed sequence that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param queueIndex The queue to use from the available queues
     * @param totalTimestamps The maximum number of timestamps to allocate.
     * If zero (default), disables latching of timestamps.
     * @returns Shared pointer with initialised sequence
     */
    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
                                       uint32_t totalTimestamps = 0);

    /**
     * Create a managed tensor that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param data The data to initialize the tensor with
     * @param primaryMemory Externally-allocated device memory backing the
     * primary buffer (not owned by the tensor)
     * @param primaryBuffer Externally-created primary buffer (not owned)
     * @param stagingMemory Externally-allocated staging memory (not owned)
     * @param stagingBuffer Externally-created staging buffer (not owned)
     * @param tensorType The type of tensor to initialize
     * @returns Shared pointer with initialised tensor
     */
    template<typename T>
    std::shared_ptr<TensorT<T>> tensorT(
      const std::vector<T>& data,
      vk::DeviceMemory *primaryMemory,
      vk::Buffer *primaryBuffer,
      vk::DeviceMemory *stagingMemory,
      vk::Buffer *stagingBuffer,
      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
    {
        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
          this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) };
        // Tracked via weak_ptr so the manager can destroy stragglers without
        // extending their lifetime.
        if (this->mManageResources) {
            this->mManagedTensors.push_back(tensor);
        }
        return tensor;
    }

    // Type-erased variant of tensorT(): wraps a raw data pointer described by
    // an element count, byte size, runtime data type and buffer offset.
    std::shared_ptr<Tensor> tensor(
      void* data,
      uint32_t elementTotalCount,
      uint64_t memorySize,
      const Tensor::TensorDataTypes& dataType,
      vk::DeviceMemory *primaryMemory,
      vk::Buffer *primaryBuffer,
      vk::DeviceMemory *stagingMemory,
      vk::Buffer *stagingBuffer,
      vk::DeviceSize offset,
      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
    {
        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
                                                       this->mDevice,
                                                       data,
                                                       elementTotalCount,
                                                       memorySize,
                                                       dataType,
                                                       primaryMemory,
                                                       primaryBuffer,
                                                       stagingMemory,
                                                       stagingBuffer,
                                                       offset,
                                                       tensorType) };
        if (this->mManageResources) {
            this->mManagedTensors.push_back(tensor);
        }
        return tensor;
    }

    /**
     * Default non-template function that can be used to create algorithm
     * objects which provides default types to the push and spec constants as
     * floats.
     *
     * @param pool Descriptor pool the algorithm allocates its sets from
     * @param tensors (optional) The tensors to initialise the algorithm with
     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
     * defaults to (tensor[0].size(), 1, 1)
     * @param specializationConstants (optional) float vector to use for
     * specialization constants, and defaults to an empty constant
     * @param pushConstants (optional) float vector to use for push constants,
     * and defaults to an empty constant
     * @returns Shared pointer with initialised algorithm
     */
    std::shared_ptr<Algorithm> algorithm(
      vk::DescriptorPool *pool,
      const std::vector<std::shared_ptr<Tensor>>& tensors = {},
      const std::vector<uint32_t>& spirv = {},
      const Workgroup& workgroup = {},
      const std::vector<float>& specializationConstants = {},
      const std::vector<float>& pushConstants = {})
    {
        // Delegates to the templated overload with S = P = float.
        return this->algorithm<>(
          pool, tensors, spirv, workgroup, specializationConstants, pushConstants);
    }

    /**
     * Create a managed algorithm that will be destroyed by this manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param pool Descriptor pool the algorithm allocates its sets from
     * @param tensors (optional) The tensors to initialise the algorithm with
     * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
     * @param workgroup (optional) kp::Workgroup for algorithm to use, and
     * defaults to (tensor[0].size(), 1, 1)
     * @param specializationConstants (optional) templatable vector parameter to
     * use for specialization constants, and defaults to an empty constant
     * @param pushConstants (optional) templatable vector parameter to use for
     * push constants, and defaults to an empty constant
     * @returns Shared pointer with initialised algorithm
     */
    template<typename S = float, typename P = float>
    std::shared_ptr<Algorithm> algorithm(
      vk::DescriptorPool *pool,
      const std::vector<std::shared_ptr<Tensor>>& tensors,
      const std::vector<uint32_t>& spirv,
      const Workgroup& workgroup,
      const std::vector<S>& specializationConstants,
      const std::vector<P>& pushConstants)
    {
        KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");
        std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
          this->mDevice,
          pool,
          tensors,
          spirv,
          workgroup,
          specializationConstants,
          pushConstants) };
        if (this->mManageResources) {
            this->mManagedAlgorithms.push_back(algorithm);
        }
        return algorithm;
    }

    /**
     * Destroy the GPU resources and all managed resources by manager.
     **/
    void destroy();

    /**
     * Run a pseudo-garbage collection to release all the managed resources
     * that have been already freed due to these reaching to zero ref count.
     **/
    void clear();

    /**
     * Information about the current device.
     *
     * @return vk::PhysicalDeviceProperties containing information about the
     *device
     **/
    vk::PhysicalDeviceProperties getDeviceProperties() const;

    /**
     * List the devices available in the current vulkan instance.
     *
     * @return vector of physical devices containing their respective properties
     **/
    std::vector<vk::PhysicalDevice> listDevices() const;

    /**
     * The current Vulkan instance.
     *
     * @return a shared pointer to the current Vulkan instance held by this
     *object
     **/
    std::shared_ptr<vk::Instance> getVkInstance() const;

    // Accessors for the logical and physical device shared pointers.
    std::shared_ptr<vk::Device> device() const { return mDevice; }
    std::shared_ptr<vk::PhysicalDevice> physicalDevice() const { return mPhysicalDevice; }

  private:
    // -------------- OPTIONALLY OWNED RESOURCES
    // mFree* flags record whether this manager created (and must destroy)
    // the corresponding Vulkan object.
    std::shared_ptr<vk::Instance> mInstance = nullptr;
    bool mFreeInstance = false;
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    bool mFreeDevice = false;

    // -------------- ALWAYS OWNED RESOURCES
    // weak_ptr tracking: resources are released when their last external
    // shared_ptr drops, or explicitly on destroy()/clear().
    std::vector<std::weak_ptr<Tensor>> mManagedTensors;
    std::vector<std::weak_ptr<Sequence>> mManagedSequences;
    std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;

    std::vector<uint32_t> mComputeQueueFamilyIndices;
    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;

    bool mManageResources = false;

#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    vk::DebugReportCallbackEXT mDebugReportCallback;
    vk::DispatchLoaderDynamic mDebugDispatcher;
#endif

    // Create functions
    void createInstance();
    void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
                      uint32_t physicalDeviceIndex = 0,
                      const std::vector<std::string>& desiredExtensions = {});
};
} // End namespace kp

View file

@ -0,0 +1,313 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Container of operations that can be sent to GPU as batch
*/
class Sequence : public std::enable_shared_from_this<Sequence>
{
  public:
    /**
     * Main constructor for sequence which requires core vulkan components to
     * generate all dependent resources.
     *
     * @param physicalDevice Vulkan physical device
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
     * @param totalTimestamps Maximum number of timestamps to allocate
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
             uint32_t queueIndex,
             uint32_t totalTimestamps = 0);
    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
     */
    ~Sequence();

    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param op Object derived from kp::BaseOp that will be recorded by the
     * sequence which will be used when the operation is evaluated.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);

    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        // Construct the op of type T in place and defer to the non-template
        // record() overload.
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->record(op);
    }
    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
     * function also requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
                                     TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->record(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job synchronously (with a barrier).
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval();

    /**
     * Resets all the recorded and stored operations, records the operation
     * provided and submits into the gpu as a submit job synchronously (with a
     * barrier).
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    // Buffer-based eval overload: constructs an op of type T from raw
    // primary/staging buffers and a byte size (T's constructor must accept
    // (primaryBuffer, stagingBuffer, size, ...)), then evaluates it.
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(vk::Buffer *primaryBuffer,
                                   vk::Buffer *stagingBuffer,
                                   vk::DeviceSize size,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(primaryBuffer, stagingBuffer, size, std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync();
    /**
     * Clears current operations to record provided one in the vector of
     * operations into the gpu as a submit job without a barrier. EvalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
    /**
     * Constructs an op of type T from the given tensors and submits it
     * asynchronously (no barrier). EvalAwait() must be called afterwards.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }
    /**
     * Constructs an op of type T from the given algorithm and submits it
     * asynchronously (no barrier). EvalAwait() must be called afterwards.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
                                        TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }

    /**
     * Eval Await waits for the fence to finish processing and then once it
     * finishes, it runs the postEval of all operations.
     *
     * @param waitFor Number of milliseconds to wait before timing out.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);

    /**
     * Clear function clears all operations currently recorded and starts
     * recording again.
     */
    void clear();

    /**
     * Return the timestamps that were latched at the beginning and
     * after each operation during the last eval() call.
     */
    std::vector<std::uint64_t> getTimestamps();

    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
     */
    void begin();

    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
     */
    void end();

    /**
     * Returns true if the sequence is currently in recording activated.
     *
     * @return Boolean stating if recording ongoing.
     */
    bool isRecording() const;

    /**
     * Returns true if the sequence has been initialised, and it's based on the
     * GPU resources being referenced.
     *
     * @return Boolean stating if is initialized
     */
    bool isInit() const;

    /**
     * Clears command buffer and triggers re-record of all the current
     * operations saved, which is useful if the underlying kp::Tensors or
     * kp::Algorithms are modified and need to be re-recorded.
     */
    void rerecord();

    /**
     * Returns true if the sequence is currently running - mostly used for async
     * workloads.
     *
     * @return Boolean stating if currently running.
     */
    bool isRunning() const;

    /**
     * Destroys and frees the GPU resources which include the buffer and memory
     * and sets the sequence as init=False.
     */
    void destroy();

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
    uint32_t mQueueIndex = -1;

    // -------------- OPTIONALLY OWNED RESOURCES
    // mFree* flags record whether this sequence created (and must destroy)
    // the pool/buffer.
    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
    bool mFreeCommandPool = false;
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations{};
    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
    bool mIsRunning = false;

    // Create functions
    void createCommandPool();
    void createCommandBuffer();
    void createTimestampQueryPool(uint32_t totalTimestamps);
};
} // End namespace kp

View file

@ -0,0 +1,306 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (c) 2023 Nomic, Inc. All rights reserved.
*
* This software is licensed under the terms of the Software for Open Models License (SOM),
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
*/
#pragma once
#include "kompute/Core.hpp"
#include "logger/Logger.hpp"
#include <memory>
#include <string>
namespace kp {
/**
* Structured data used in GPU operations.
*
* Tensors are the base building block in Kompute to perform operations across
* GPUs. Each tensor would have a respective Vulkan memory and buffer, which
* would be used to store their respective data. The tensors can be used for GPU
* data storage or transfer.
*/
class Tensor
{
  public:
    /**
     * Type for tensors created: Device allows memory to be transferred from
     * staging buffers. Staging are host memory visible. Storage are device
     * visible but are not set up to transfer or receive data (only for shader
     * storage).
     */
    enum class TensorTypes
    {
        eDevice = 0,  ///< Type is device memory, source and destination
        eHost = 1,    ///< Type is host memory, source and destination
        eStorage = 2, ///< Type is Device memory (only)
    };

    /// Element type of the raw data held by the tensor.
    enum class TensorDataTypes
    {
        eBool = 0,
        eInt = 1,
        eUnsignedInt = 2,
        eFloat = 3,
        eDouble = 4,
    };

    /// Human-readable name for a data type (used in logs and error messages).
    static std::string toString(TensorDataTypes dt);
    /// Human-readable name for a tensor type (used in logs and error messages).
    static std::string toString(TensorTypes dt);

    /**
     * Constructor with data provided which would be used to create the
     * respective vulkan buffer and memory.
     *
     * @param physicalDevice The physical device to use to fetch properties
     * @param device The device to use to create the buffer and memory from
     * @param data Non-zero-sized vector of data that will be used by the
     * tensor
     * @param elementTotalCount Total number of elements contained in data
     * @param memorySize Total size of the data region in bytes
     * @param dataType Element type of the data (TensorDataTypes)
     * @param primaryMemory Externally managed device memory backing the
     * primary buffer (never owned by this tensor)
     * @param primaryBuffer Externally managed primary buffer (never owned)
     * @param stagingMemory Externally managed memory backing the staging
     * buffer (never owned)
     * @param stagingBuffer Externally managed staging buffer (never owned)
     * @param offset Offset into the provided memory/buffers at which this
     * tensor's data region begins
     * @param tensorType Type for the tensor which is of type TensorTypes
     */
    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           void* data,
           uint32_t elementTotalCount,
           uint32_t memorySize,
           const TensorDataTypes& dataType,
           vk::DeviceMemory *primaryMemory,
           vk::Buffer *primaryBuffer,
           vk::DeviceMemory *stagingMemory,
           vk::Buffer *stagingBuffer,
           vk::DeviceSize offset,
           const TensorTypes& tensorType = TensorTypes::eDevice);

    /**
     * Destructor which is in charge of freeing vulkan resources unless they
     * have been provided externally.
     */
    virtual ~Tensor();

    /**
     * Function to trigger reinitialisation of the tensor buffer and memory with
     * new data as well as new potential device type.
     *
     * @param data Pointer to data to use to initialise the tensor from
     * @param elementTotalCount Total number of elements contained in data
     * @param memorySize Total size of the data region in bytes
     * @param primaryMemory Externally managed primary memory (never owned)
     * @param primaryBuffer Externally managed primary buffer (never owned)
     * @param stagingMemory Externally managed staging memory (never owned)
     * @param stagingBuffer Externally managed staging buffer (never owned)
     * @param offset Offset into the provided memory/buffers for this tensor
     */
    void rebuild(void* data,
                 uint32_t elementTotalCount,
                 uint64_t memorySize,
                 vk::DeviceMemory *primaryMemory,
                 vk::Buffer *primaryBuffer,
                 vk::DeviceMemory *stagingMemory,
                 vk::Buffer *stagingBuffer,
                 vk::DeviceSize offset);

    /**
     * Destroys and frees the GPU resources which include the buffer and memory.
     */
    void destroy();

    /**
     * Check whether tensor is initialized based on the created gpu resources.
     *
     * @returns Boolean stating whether tensor is initialized
     */
    bool isInit();

    /**
     * Retrieve the tensor type of the Tensor
     *
     * @return Tensor type of tensor
     */
    TensorTypes tensorType();

    /**
     * Records a copy from the memory of the tensor provided to the current
     * tensor. This is intended to pass memory into a processing, to perform
     * a staging buffer transfer, or to gather output (between others).
     *
     * @param commandBuffer Vulkan Command Buffer to record the commands into
     * @param copyFromTensor Tensor to copy the data from
     */
    void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                        std::shared_ptr<Tensor> copyFromTensor);

    /**
     * Records a copy from the internal staging memory to the device memory
     * using an optional barrier to wait for the operation. This function would
     * only be relevant for kp::Tensors of type eDevice.
     *
     * @param commandBuffer Vulkan Command Buffer to record the commands into
     */
    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);

    /**
     * Records a copy from the internal device memory to the staging memory
     * using an optional barrier to wait for the operation. This function would
     * only be relevant for kp::Tensors of type eDevice.
     *
     * @param commandBuffer Vulkan Command Buffer to record the commands into
     */
    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);

    /**
     * Records the buffer memory barrier into the primary buffer and command
     * buffer which ensures that relevant data transfers are carried out
     * correctly.
     *
     * @param commandBuffer Vulkan Command Buffer to record the commands into
     * @param srcAccessMask Access flags for source access mask
     * @param dstAccessMask Access flags for destination access mask
     * @param srcStageMask Pipeline stage flags for source stage mask
     * @param dstStageMask Pipeline stage flags for destination stage mask
     */
    void recordPrimaryBufferMemoryBarrier(
      const vk::CommandBuffer& commandBuffer,
      vk::AccessFlagBits srcAccessMask,
      vk::AccessFlagBits dstAccessMask,
      vk::PipelineStageFlagBits srcStageMask,
      vk::PipelineStageFlagBits dstStageMask);

    /**
     * Records the buffer memory barrier into the staging buffer and command
     * buffer which ensures that relevant data transfers are carried out
     * correctly.
     *
     * @param commandBuffer Vulkan Command Buffer to record the commands into
     * @param srcAccessMask Access flags for source access mask
     * @param dstAccessMask Access flags for destination access mask
     * @param srcStageMask Pipeline stage flags for source stage mask
     * @param dstStageMask Pipeline stage flags for destination stage mask
     */
    void recordStagingBufferMemoryBarrier(
      const vk::CommandBuffer& commandBuffer,
      vk::AccessFlagBits srcAccessMask,
      vk::AccessFlagBits dstAccessMask,
      vk::PipelineStageFlagBits srcStageMask,
      vk::PipelineStageFlagBits dstStageMask);

    /**
     * Constructs a vulkan descriptor buffer info which can be used to specify
     * and reference the underlying buffer component of the tensor without
     * exposing it.
     *
     * @return Descriptor buffer info with own buffer
     */
    vk::DescriptorBufferInfo constructDescriptorBufferInfo();

    /**
     * Returns the size/magnitude of the Tensor, which will be the total number
     * of elements across all dimensions
     *
     * @return Unsigned integer representing the total number of elements
     */
    uint32_t size();

    /**
     * Returns the total memory size of the data contained by the Tensor object
     *
     * @return Unsigned integer representing the memory of the tensor in bytes.
     */
    uint64_t memorySize();

    /**
     * Retrieve the data type of the tensor (host, device, storage)
     *
     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
     */
    TensorDataTypes dataType();

    /**
     * Retrieve the raw data via the pointer to the memory that contains the raw
     * memory of this current tensor. This pointer becomes null when the
     * Tensor is destroyed.
     *
     * @return Pointer to raw memory containing raw bytes data of Tensor.
     */
    void* rawData();

    /**
     * Sets / resets the data of the tensor which is directly done on the GPU
     * host visible memory available by the tensor.
     */
    void setRawData(const void* data);

    /**
     * Template to return the pointer data converted by specific type, which
     * would be any of the supported types including float, double, int32,
     * uint32 and bool.
     *
     * @return Pointer to raw memory containing raw bytes data of Tensor.
     */
    template<typename T>
    T* data()
    {
        return (T*)this->mRawData;
    }

    /**
     * Template to get the data of the current tensor as a vector of specific
     * type, which would be any of the supported types including float, double,
     * int32, uint32 and bool.
     *
     * @return Vector of type provided by template.
     */
    template<typename T>
    std::vector<T> vector()
    {
        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
    }

  protected:
    // -------------- ALWAYS OWNED RESOURCES
    TensorTypes mTensorType;       // Memory placement strategy (device/host/storage)
    TensorDataTypes mDataType;     // Element type of the stored data
    uint32_t mSize = 0;            // Number of elements (not bytes)
    uint64_t mMemorySize = 0;      // Size of the data region in bytes
    vk::DeviceSize mOffset = 0;    // Offset into the shared backing buffers
    void* mRawData = nullptr;      // Host-visible mapped data pointer

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
    std::shared_ptr<vk::Device> mDevice;
    vk::Buffer *mPrimaryBuffer = nullptr;
    vk::Buffer *mStagingBuffer = nullptr;
    vk::DeviceMemory *mPrimaryMemory = nullptr;
    vk::DeviceMemory *mStagingMemory = nullptr;

    // Stores the externally managed memory/buffer handles and offset.
    void setGPUResources(vk::DeviceMemory *primaryMemory,
                         vk::Buffer *primaryBuffer,
                         vk::DeviceMemory *stagingMemory,
                         vk::Buffer *stagingBuffer,
                         vk::DeviceSize offset);
    // Records a vkCmdCopyBuffer between the two buffers with the given region.
    void recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
                          vk::Buffer *bufferFrom,
                          vk::Buffer *bufferTo,
                          vk::DeviceSize bufferSize,
                          vk::BufferCopy copyRegion);
    // Shared implementation behind the primary/staging barrier helpers.
    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
                                   const vk::Buffer& buffer,
                                   vk::AccessFlagBits srcAccessMask,
                                   vk::AccessFlagBits dstAccessMask,
                                   vk::PipelineStageFlagBits srcStageMask,
                                   vk::PipelineStageFlagBits dstStageMask);

    // Private util functions: usage/property flags derived from mTensorType.
    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
    vk::BufferUsageFlags getStagingBufferUsageFlags();
    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
};
// Thin typed wrapper over Tensor; dataType() maps the template parameter T to
// the matching TensorDataTypes value (specialisations live in the .cpp file).
template<typename T>
class TensorT : public Tensor
{
  public:
    ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }

    // Returns the TensorDataTypes enum value corresponding to T.
    TensorDataTypes dataType();
};
} // End namespace kp

View file

@ -0,0 +1,197 @@
#pragma once

// Numeric log levels compared against KOMPUTE_OPT_ACTIVE_LOG_LEVEL below.
#define KOMPUTE_LOG_LEVEL_TRACE 0
#define KOMPUTE_LOG_LEVEL_DEBUG 1
#define KOMPUTE_LOG_LEVEL_INFO 2
#define KOMPUTE_LOG_LEVEL_WARN 3
#define KOMPUTE_LOG_LEVEL_ERROR 4
#define KOMPUTE_LOG_LEVEL_CRITICAL 5
#define KOMPUTE_LOG_LEVEL_OFF 6

// Logging is disabled entirely.
#if KOMPUTE_OPT_LOG_LEVEL_DISABLED
#define KP_LOG_TRACE(...)
#define KP_LOG_DEBUG(...)
#define KP_LOG_INFO(...)
#define KP_LOG_WARN(...)
#define KP_LOG_ERROR(...)
#else

#if !KOMPUTE_OPT_USE_SPDLOG
#if VK_USE_PLATFORM_ANDROID_KHR
#include <android/log.h>
#include <fmt/core.h>
static const char* KOMPUTE_LOG_TAG = "KomputeLog";
#else
#if KOMPUTE_BUILD_PYTHON
#include <pybind11/pybind11.h>
namespace py = pybind11;
// from python/src/main.cpp
extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
#else
#include <fmt/core.h>
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#include <spdlog/spdlog.h>
#endif // !KOMPUTE_OPT_USE_SPDLOG
#include <set>
#include <string>
#include <vector>

namespace logger {
// Setup the logger, note the loglevel can not be set below the CMake log level
// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...)
void
setupLogger();

// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing
// else is defined, overriding logging.
#if !KOMPUTE_OPT_USE_SPDLOG

#ifndef KP_LOG_TRACE
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_TRACE(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// Fixed: this previously defined KP_LOG_DEBUG (copy-paste), which left
// KP_LOG_TRACE undefined in Python builds.
#define KP_LOG_TRACE(...) kp_trace(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_TRACE(...)                                                      \
    fmt::print("[{} {}] [trace] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_TRACE(...)
#endif
#endif // !KP_LOG_TRACE

#ifndef KP_LOG_DEBUG
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_DEBUG(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__))
#else
#ifdef __FILE_NAME__ // gcc 12 provides only file name without path
#define KP_LOG_DEBUG(...)                                                      \
    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE_NAME__,                                                  \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#else
#define KP_LOG_DEBUG(...)                                                      \
    fmt::print("[{} {}] [debug] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // __FILE_NAME__
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_DEBUG(...)
#endif
#endif // !KP_LOG_DEBUG

#ifndef KP_LOG_INFO
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_INFO(...)                                                       \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// Fixed: this previously defined KP_LOG_DEBUG (copy-paste), which left
// KP_LOG_INFO undefined in Python builds and redefined KP_LOG_DEBUG.
#define KP_LOG_INFO(...) kp_info(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_INFO(...)                                                       \
    fmt::print("[{} {}] [info] [{}:{}] {}\n",                                  \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_INFO(...)
#endif
#endif // !KP_LOG_INFO

#ifndef KP_LOG_WARN
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_WARN(...)                                                       \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// Fixed: this previously defined KP_LOG_DEBUG (copy-paste), which left
// KP_LOG_WARN undefined in Python builds and redefined KP_LOG_DEBUG.
#define KP_LOG_WARN(...) kp_warning(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_WARN(...)                                                       \
    fmt::print("[{} {}] [warn] [{}:{}] {}\n",                                  \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_WARN(...)
#endif
#endif // !KP_LOG_WARN

#ifndef KP_LOG_ERROR
#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR
#if VK_USE_PLATFORM_ANDROID_KHR
#define KP_LOG_ERROR(...)                                                      \
    ((void)__android_log_write(                                                \
      ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str()))
#else
#if KOMPUTE_BUILD_PYTHON
// Fixed: this previously defined KP_LOG_DEBUG (copy-paste), which left
// KP_LOG_ERROR undefined in Python builds and redefined KP_LOG_DEBUG.
#define KP_LOG_ERROR(...) kp_error(fmt::format(__VA_ARGS__))
#else
#define KP_LOG_ERROR(...)                                                      \
    fmt::print("[{} {}] [error] [{}:{}] {}\n",                                 \
               __DATE__,                                                       \
               __TIME__,                                                       \
               __FILE__,                                                       \
               __LINE__,                                                       \
               fmt::format(__VA_ARGS__))
#endif // KOMPUTE_BUILD_PYTHON
#endif // VK_USE_PLATFORM_ANDROID_KHR
#else
#define KP_LOG_ERROR(...)
#endif
#endif // !KP_LOG_ERROR

#else
#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
#define KP_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)

// Set the runtime log level (spdlog builds only).
void
setLogLevel(spdlog::level::level_enum level);

// Get the runtime log level derived from SPDLOG_ACTIVE_LEVEL.
spdlog::level::level_enum
getLogLevel();
#endif // !KOMPUTE_OPT_USE_SPDLOG

} // namespace logger

#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED

View file

@ -0,0 +1,86 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>

#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that provides a general abstraction that simplifies the use of
* algorithm and parameter components which can be used with shaders.
* By default it enables the user to provide a dynamic number of tensors
* which are then passed as inputs.
*/
class OpAlgoDispatch : public OpBase
{
  public:
    /**
     * Constructor that keeps a reference to the algorithm to dispatch and
     * takes an owned copy of the push constants that will override the
     * algorithm's defaults when the operation is recorded.
     *
     * @param algorithm The algorithm object to use for dispatch
     * @param pushConstants The push constants to use for override
     */
    template<typename T = float>
    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
                   const std::vector<T>& pushConstants = {})
      : mAlgorithm(algorithm)
    {
        KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");

        if (!pushConstants.empty()) {
            // Keep a raw owned copy; freed by the destructor defined in the
            // implementation file.
            const uint32_t elementSize = sizeof(T);
            const uint32_t elementCount =
              static_cast<uint32_t>(pushConstants.size());
            const uint32_t totalBytes = elementCount * elementSize;
            this->mPushConstantsData = malloc(totalBytes);
            memcpy(this->mPushConstantsData, pushConstants.data(), totalBytes);
            this->mPushConstantsDataTypeMemorySize = elementSize;
            this->mPushConstantsSize = elementCount;
        }
    }

    /**
     * Default destructor, which is in charge of destroying the algorithm
     * components but does not destroy the underlying tensors
     */
    virtual ~OpAlgoDispatch() override;

    /**
     * This records the commands that are to be sent to the GPU. This includes
     * the barriers that ensure the memory has been copied before going in and
     * out of the shader, as well as the dispatch operation that sends the
     * shader processing to the gpu. This function also records the GPU memory
     * copy of the output data for the staging buffer so it can be read by the
     * host.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // -------------- ALWAYS OWNED RESOURCES
    std::shared_ptr<Algorithm> mAlgorithm;      // Algorithm to dispatch
    void* mPushConstantsData = nullptr;         // Owned copy of push constants
    uint32_t mPushConstantsDataTypeMemorySize = 0; // sizeof one element
    uint32_t mPushConstantsSize = 0;            // Number of elements
};
} // End namespace kp

View file

@ -0,0 +1,62 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
namespace kp {
/**
* Base Operation which provides the high level interface that Kompute
* operations implement in order to perform a set of actions in the GPU.
*
* Operations can perform actions on tensors, and optionally can also own an
* Algorithm with respective parameters. kp::Operations with kp::Algorithms
* would inherit from kp::OpBaseAlgo.
*/
// Pure-virtual interface implemented by every Kompute operation; the Sequence
// drives the record/preEval/postEval lifecycle declared here.
class OpBase
{
  public:
    /**
     * Default destructor for OpBase class. This OpBase destructor class should
     * always be called to destroy and free owned resources unless it is
     * intended to destroy the resources in the parent class.
     */
    virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); }

    /**
     * The record function is intended to only send a record command or run
     * commands that are expected to record operations that are to be submitted
     * as a batch into the GPU.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) = 0;

    /**
     * Pre eval is called before the Sequence has called eval and submitted the
     * commands to the GPU for processing, and can be used to perform any
     * per-eval setup steps required as the computation iteration begins. It's
     * worth noting that there are situations where eval can be called multiple
     * times, so the resources that are created should be idempotent in case
     * it's called multiple times in a row.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;

    /**
     * Post eval is called after the Sequence has called eval and submitted the
     * commands to the GPU for processing, and can be used to perform any
     * tear-down steps required as the computation iteration finishes. It's
     * worth noting that there are situations where eval can be called multiple
     * times, so the resources that are destroyed should not require a re-init
     * unless explicitly provided by the user.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
};
} // End namespace kp

View file

@ -0,0 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/operations/OpBase.hpp"
namespace kp {
// Operation that copies raw buffer contents from a staging buffer into a
// primary (device) buffer when recorded into a command buffer.
class OpBufferSyncDevice : public OpBase
{
  public:
    /**
     * Constructor storing the buffers to sync. The buffers are not owned by
     * this operation; the caller must keep them alive while the op is in use.
     *
     * @param primaryBuffer Destination buffer (device side)
     * @param stagingBuffer Source buffer (staging side)
     * @param size Number of bytes to copy
     */
    OpBufferSyncDevice(
      vk::Buffer *primaryBuffer,
      vk::Buffer *stagingBuffer,
      vk::DeviceSize size);

    /**
     * Default destructor. This class does not manage memory so it won't be
     * expecting the parent to perform a release.
     */
    ~OpBufferSyncDevice() override;

    /**
     * For device buffers, it records the copy command for the buffer to copy
     * the data from its staging to device memory.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // Never owned: raw buffer handles provided by the caller.
    vk::Buffer *mPrimaryBuffer;
    vk::Buffer *mStagingBuffer;
    vk::DeviceSize mSize; // Byte count of the copy region
};
} // End namespace kp

View file

@ -0,0 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/operations/OpBase.hpp"
namespace kp {
// Operation that copies raw buffer contents from a primary (device) buffer
// back into a staging buffer so the host can read the results.
class OpBufferSyncLocal : public OpBase
{
  public:
    /**
     * Constructor storing the buffers to sync. The buffers are not owned by
     * this operation; the caller must keep them alive while the op is in use.
     *
     * @param primaryBuffer Source buffer (device side)
     * @param stagingBuffer Destination buffer (staging side)
     * @param size Number of bytes to copy
     */
    OpBufferSyncLocal(
      vk::Buffer *primaryBuffer,
      vk::Buffer *stagingBuffer,
      vk::DeviceSize size);

    /**
     * Default destructor. This class does not manage memory so it won't be
     * expecting the parent to perform a release.
     */
    ~OpBufferSyncLocal() override;

    /**
     * For device buffers, it records the copy command for the buffer to copy
     * the data from its device to staging memory. (Original comment said the
     * reverse direction; the "Local" sync mirrors OpTensorSyncLocal.)
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // Never owned: raw buffer handles provided by the caller.
    vk::Buffer *mPrimaryBuffer;
    vk::Buffer *mStagingBuffer;
    vk::DeviceSize mSize; // Byte count of the copy region
};
} // End namespace kp

View file

@ -0,0 +1,81 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Algorithm.hpp"
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that provides a general abstraction that simplifies the use of
* algorithm and parameter components which can be used with shaders.
* It exposes the pipeline barrier functionality specifically for memory
* barriers that can be configured through the respective source and destination
* masks
*/
/**
 * Operation that provides a general abstraction that simplifies the use of
 * algorithm and parameter components which can be used with shaders.
 * It exposes the pipeline barrier functionality specifically for memory
 * barriers that can be configured through the respective source and destination
 * masks
 */
class OpMemoryBarrier : public OpBase
{
  public:
    /**
     * Constructor that stores tensors as well as memory barrier parameters to
     * be used to create a pipeline barrier on the respective primary or staging
     * tensor.
     *
     * @param tensors The tensors to apply the memory barriers on
     * @param srcAccessMask The vk::AccessFlagBits for the source access mask
     * @param dstAccessMask The vk::AccessFlagBits for the destination access
     * mask
     * @param srcStageMask The vk::PipelineStageFlagBits for the source stage
     * mask
     * @param dstStageMask The vk::PipelineStageFlagBits for the destination
     * stage mask
     * @param barrierOnPrimary Boolean to select primary or staging buffers on
     * tensors
     */
    OpMemoryBarrier(const std::vector<std::shared_ptr<Tensor>>& tensors,
                    const vk::AccessFlagBits& srcAccessMask,
                    const vk::AccessFlagBits& dstAccessMask,
                    const vk::PipelineStageFlagBits& srcStageMask,
                    const vk::PipelineStageFlagBits& dstStageMask,
                    bool barrierOnPrimary = true);

    /**
     * Default destructor, which is in charge of destroying the reference to the
     * tensors and all the relevant access / stage masks created
     */
    virtual ~OpMemoryBarrier() override;

    /**
     * This records the memory barrier with the access and stage masks provided
     * across all relevant tensors.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // Barrier configuration captured at construction time.
    const vk::AccessFlagBits mSrcAccessMask;
    const vk::AccessFlagBits mDstAccessMask;
    const vk::PipelineStageFlagBits mSrcStageMask;
    const vk::PipelineStageFlagBits mDstStageMask;
    const bool mBarrierOnPrimary; // true: primary buffers, false: staging
    const std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View file

@ -0,0 +1,58 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <fstream>
#include "kompute/Core.hpp"
#include "ShaderOpMult.hpp"
#include "kompute/Algorithm.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
namespace kp {
/**
* Operation that performs multiplication on two tensors and outpus on third
* tensor.
*/
/**
 * Operation that multiplies two input tensors element-wise and writes the
 * result into a third tensor, using the bundled OpMult compute shader.
 */
class OpMult : public OpAlgoDispatch
{
  public:
    /**
     * Constructor that validates the tensor count and rebuilds the provided
     * algorithm with the OpMult shader.
     *
     * @param tensors Tensors that are to be used in this operation
     * @param algorithm An algorithm that will be overridden with the OpMult
     * shader data and the tensors provided which are expected to be 3
     */
    OpMult(std::vector<std::shared_ptr<Tensor>> tensors,
           std::shared_ptr<Algorithm> algorithm)
      : OpAlgoDispatch(algorithm)
    {
        KP_LOG_DEBUG("Kompute OpMult constructor with params");

        // Exactly two inputs and one output tensor are required.
        if (tensors.size() != 3) {
            throw std::runtime_error(
              "Kompute OpMult expected 3 tensors but got " +
              std::to_string(tensors.size()));
        }

        const std::vector<uint32_t> spirv(SHADEROPMULT_COMP_SPV.begin(),
                                          SHADEROPMULT_COMP_SPV.end());
        algorithm->rebuild<>(tensors, spirv);
    }

    /**
     * Default destructor, which is in charge of destroying the algorithm
     * components but does not destroy the underlying tensors
     */
    ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); }
};
} // End namespace kp

View file

@ -0,0 +1,63 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that copies the data from the first tensor to the rest of the
* tensors provided, using a record command for all the vectors. This operation
* does not own/manage the memory of the tensors passed to it. The operation
* must only receive tensors of type
*/
/**
 * Operation that copies the data from the first tensor to the rest of the
 * tensors provided, using a record command for all the vectors. This operation
 * does not own/manage the memory of the tensors passed to it. The operation
 * must only receive tensors of compatible type and size.
 * NOTE(review): the original comment was truncated after "tensors of type" —
 * confirm the exact constraint against the implementation.
 */
class OpTensorCopy : public OpBase
{
  public:
    /**
     * Default constructor with parameters that provides the core vulkan
     * resources and the tensors that will be used in the operation.
     *
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
     * Default destructor. This class does not manage memory so it won't be
     * expecting the parent to perform a release.
     */
    ~OpTensorCopy() override;

    /**
     * Records the copy commands from the first tensor into all the other
     * tensors provided. Also optionally records a barrier.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Copies the local vectors for all the tensors to sync the data with the
     * gpu.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // -------------- ALWAYS OWNED RESOURCES
    std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View file

@ -0,0 +1,66 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that syncs tensor's device by mapping local data into the device
* memory. For TensorTypes::eDevice it will use a record operation for the
* memory to be syncd into GPU memory which means that the operation will be
* done in sync with GPU commands. For TensorTypes::eHost it will only map the
* data into host memory which will happen during preEval before the recorded
* commands are dispatched.
*/
/**
 * Operation that syncs tensor's device by mapping local data into the device
 * memory. For TensorTypes::eDevice it will use a record operation for the
 * memory to be syncd into GPU memory which means that the operation will be
 * done in sync with GPU commands. For TensorTypes::eHost it will only map the
 * data into host memory which will happen during preEval before the recorded
 * commands are dispatched.
 */
class OpTensorSyncDevice : public OpBase
{
  public:
    /**
     * Default constructor with parameters that provides the core vulkan
     * resources and the tensors that will be used in the operation. The
     * tensors provided cannot be of type TensorTypes::eStorage.
     *
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
     * Default destructor. This class does not manage memory so it won't be
     * expecting the parent to perform a release.
     */
    ~OpTensorSyncDevice() override;

    /**
     * For device tensors, it records the copy command for the tensor to copy
     * the data from its staging to device memory.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // -------------- ALWAYS OWNED RESOURCES
    std::vector<std::shared_ptr<Tensor>> mTensors;
    // NOTE(review): raw buffer handles below are not owned despite the header
    // above — confirm lifetime against the implementation file.
    vk::Buffer *mPrimaryBuffer;
    vk::Buffer *mStagingBuffer;
    vk::DeviceSize mSize;
};
} // End namespace kp

View file

@ -0,0 +1,66 @@
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "kompute/Core.hpp"
#include "kompute/Tensor.hpp"
#include "kompute/operations/OpBase.hpp"
namespace kp {
/**
* Operation that syncs tensor's local memory by mapping device data into the
* local CPU memory. For TensorTypes::eDevice it will use a record operation
* for the memory to be syncd into GPU memory which means that the operation
* will be done in sync with GPU commands. For TensorTypes::eHost it will
* only map the data into host memory which will happen during preEval before
* the recorded commands are dispatched.
*/
/**
 * Operation that syncs tensor's local memory by mapping device data into the
 * local CPU memory. For TensorTypes::eDevice it will use a record operation
 * for the memory to be syncd into GPU memory which means that the operation
 * will be done in sync with GPU commands. For TensorTypes::eHost it will
 * only map the data into host memory which will happen during preEval before
 * the recorded commands are dispatched.
 */
class OpTensorSyncLocal : public OpBase
{
  public:
    /**
     * Default constructor with parameters that provides the core vulkan
     * resources and the tensors that will be used in the operation. The tensors
     * provided cannot be of type TensorTypes::eStorage.
     *
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
     * Default destructor. This class does not manage memory so it won't be
     * expecting the parent to perform a release.
     */
    ~OpTensorSyncLocal() override;

    /**
     * For device tensors, it records the copy command for the tensor to copy
     * the data from its device to staging memory.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * For host tensors it performs the map command from the host memory into
     * local memory.
     *
     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

  private:
    // -------------- ALWAYS OWNED RESOURCES
    std::vector<std::shared_ptr<Tensor>> mTensors;
};
} // End namespace kp

View file

@ -0,0 +1,69 @@
cmake_minimum_required(VERSION 3.20)

# Build the logger as its own static/shared library target.
set(LOGGER_SOURCES Logger.cpp)
add_library(kp_logger ${LOGGER_SOURCES})

# Define log levels in code
add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0)
add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1)
add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2)
add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3)
add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4)
add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5)
add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6)

# Fail fast on option combinations the logger cannot support.
if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG)
    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
endif()
if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG)
    message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.")
endif()

# Map the user-facing log level name onto the suffix used by the
# KOMPUTE_LOG_LEVEL_* / SPDLOG_LEVEL_* macro families.
if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Trace")
    set(KOMPUTE_OPT_LOG_LEVEL TRACE)
    message(STATUS "Using log level Trace")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Debug")
    set(KOMPUTE_OPT_LOG_LEVEL DEBUG)
    message(STATUS "Using log level Debug")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Info")
    set(KOMPUTE_OPT_LOG_LEVEL INFO)
    message(STATUS "Using log level Info")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Warn")
    set(KOMPUTE_OPT_LOG_LEVEL WARN)
    message(STATUS "Using log level Warn")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Error")
    set(KOMPUTE_OPT_LOG_LEVEL ERROR)
    message(STATUS "Using log level Error")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Critical")
    set(KOMPUTE_OPT_LOG_LEVEL CRITICAL)
    message(STATUS "Using log level Critical")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
    set(KOMPUTE_OPT_LOG_LEVEL OFF)
    message(STATUS "Using log level Off")
elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Default")
    # Generator expression: DEBUG in Debug builds, INFO otherwise.
    set(KOMPUTE_OPT_LOG_LEVEL $<IF:$<CONFIG:Debug>,DEBUG,INFO>)
    message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type")
else()
    message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.")
endif()

# Always make sure we define the Kompute log level independent of the Spdlog log level
target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})

# Link depending on how the logger should be setup
if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
    if(KOMPUTE_OPT_USE_SPDLOG)
        target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
        target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
        target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
        message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")
        if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE)
            target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1)
        endif()
    else()
        # Without spdlog the header-only logging macros format via fmt.
        target_link_libraries(kp_logger PUBLIC fmt::fmt)
    endif()
endif()

View file

@ -0,0 +1,101 @@
#include "kompute/logger/Logger.hpp"
#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED
#if !KOMPUTE_OPT_USE_SPDLOG
#else
#include <cassert>
#include <iostream>
#include <memory>
#include <mutex>
#include <spdlog/async.h>
#include <spdlog/common.h>
#include <spdlog/logger.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/spdlog.h>
#include <string>
#endif // !KOMPUTE_OPT_USE_SPDLOG
namespace logger {
#if !KOMPUTE_OPT_USE_SPDLOG
// No-op variant: when spdlog is not used, logging goes through the fallback
// backend and requires no global setup.
void
setupLogger()
{
}
#else
constexpr int THREAD_QUEUE_LENGTH = 8192;
/**
 * Configures the process-wide spdlog default logger (colored stdout sink,
 * optional async logging). Safe to call from multiple threads; only the
 * first call performs the setup.
 *
 * Fix: the original flag+mutex pattern set `setup = true` and released the
 * mutex *before* configuring the logger, so a concurrent second caller could
 * return and log while setup was still in progress. std::call_once both
 * serializes the setup and blocks other callers until it has finished.
 */
void
setupLogger()
{
    static std::once_flag setupDone;
    std::call_once(setupDone, []() {
        spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1);
        spdlog::sink_ptr console_sink =
          std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO
        // Verbose (trace/debug) builds get a wider source-location column.
        console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v");
#else
        console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v");
#endif
        std::vector<spdlog::sink_ptr> sinks{ console_sink };
        // TODO: Add flag in compile flags
        std::shared_ptr<spdlog::logger> logger =
#if KOMPUTE_SPDLOG_ASYNC_LOGGING
          // Async logger: messages are queued and written by the thread pool;
          // the queue blocks (rather than drops) on overflow.
          std::make_shared<spdlog::async_logger>(
            "",
            sinks.begin(),
            sinks.end(),
            spdlog::thread_pool(),
            spdlog::async_overflow_policy::block);
#else
          std::make_shared<spdlog::logger>(
            "",
            sinks.begin(),
            sinks.end());
#endif
        // Runtime level mirrors the compile-time SPDLOG_ACTIVE_LEVEL.
        logger->set_level(getLogLevel());
        spdlog::set_default_logger(logger);
    });
}
// Map the compile-time SPDLOG_ACTIVE_LEVEL macro to the equivalent runtime
// spdlog level, so the default logger never filters less than what was
// compiled in. Resolved entirely by the preprocessor; unknown values fall
// back to "off".
spdlog::level::level_enum
getLogLevel()
{
#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE
    return spdlog::level::trace;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG
    return spdlog::level::debug;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO
    return spdlog::level::info;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN
    return spdlog::level::warn;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR
    return spdlog::level::error;
#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL
    return spdlog::level::critical;
#else
    return spdlog::level::off;
#endif
}
// Apply the requested runtime filter level to the process-wide default logger.
void
setLogLevel(const spdlog::level::level_enum level)
{
    const auto defaultLogger = spdlog::default_logger();
    defaultLogger->set_level(level);
}
#endif // !KOMPUTE_OPT_USE_SPDLOG
} // namespace logger
#endif

View file

@ -0,0 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# ######################
cmake_minimum_required(VERSION 3.20)
# All shader sources live under glsl/, which owns their compilation and install.
add_subdirectory(glsl)

View file

@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# ######################
cmake_minimum_required(VERSION 3.20)
# Check if build shaders from source is enabled
if(KOMPUTE_OPT_BUILD_SHADERS)
    # Compile the .comp sources to SPIR-V and wrap them in C++ headers.
    vulkan_compile_shader(INFILE ShaderOpMult.comp
        OUTFILE ShaderOpMult.hpp
        NAMESPACE "kp")
    vulkan_compile_shader(INFILE ShaderLogisticRegression.comp
        OUTFILE ShaderLogisticRegression.hpp
        NAMESPACE "kp")
else() # Else we will use our precompiled versions
    # Copy the checked-in .hpp.in payloads into the build tree so consumers see
    # the same header names either way.
    # NOTE(review): $<BUILD_INTERFACE:> inside custom-command OUTPUT paths is
    # unusual — confirm it expands as intended here rather than using the plain
    # ${CMAKE_CURRENT_BINARY_DIR} path.
    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp)
    add_custom_command(OUTPUT $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp)
endif()
# Interface library so dependents pick up the generated headers by path only.
add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp"
    "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp")
target_include_directories(kp_shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
# Make sure we install shaders:
# NOTE(review): $<BUILD_INTERFACE:> expands to empty in install() context —
# verify these install rules actually install the files.
install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(FILES $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View file

@ -0,0 +1,52 @@
#version 450

// Per-sample logistic-regression gradient pass: one invocation handles one
// training sample and writes that sample's weight/bias gradients and loss.

// Specialization constant: number of samples m, used to scale gradients so
// that summing the per-sample outputs yields the batch-averaged gradient.
layout (constant_id = 0) const float m = 0;

layout (local_size_x = 1) in;

// Inputs: features (xi, xj), labels (y), current weights (win) and bias (bin).
layout(set = 0, binding = 0) buffer bxi { float xi[]; };
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
layout(set = 0, binding = 2) buffer by { float y[]; };
layout(set = 0, binding = 3) buffer bwin { float win[]; };
// Outputs: per-sample gradients (wouti, woutj, bout) and per-sample loss (lout).
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
layout(set = 0, binding = 7) buffer bbout { float bout[]; };
layout(set = 0, binding = 8) buffer blout { float lout[]; };

// Standard logistic sigmoid: 1 / (1 + e^-z).
float sigmoid(float z) {
    return 1.0 / (1.0 + exp(-z));
}

// Model prediction: y-hat = sigmoid(w . x + b).
float inference(vec2 x, vec2 w, float b) {
    // Compute the linear mapping function
    float z = dot(w, x) + b;
    // Calculate the y-hat with sigmoid
    float yHat = sigmoid(z);
    return yHat;
}

// Binary cross-entropy loss for one sample.
float calculateLoss(float yHat, float y) {
    return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
}

void main() {
    // One invocation per training sample.
    uint idx = gl_GlobalInvocationID.x;
    // Shared model parameters: weights win[0..1], bias bin[0].
    vec2 wCurr = vec2(win[0], win[1]);
    float bCurr = bin[0];
    // This sample's features and label.
    vec2 xCurr = vec2(xi[idx], xj[idx]);
    float yCurr = y[idx];
    float yHat = inference(xCurr, wCurr, bCurr);
    // Cross-entropy gradients w.r.t. z, w and b, scaled by 1/m.
    float dZ = yHat - yCurr;
    vec2 dW = (1. / m) * xCurr * dZ;
    float dB = (1. / m) * dZ;
    // Emit per-sample results; the host reduces them across samples.
    wouti[idx] = dW.x;
    woutj[idx] = dW.y;
    bout[idx] = dB;
    lout[idx] = calculateLoss(yHat, yCurr);
}

View file

@ -0,0 +1,310 @@
#pragma once
#include <array>
#include <cstdint>
namespace kp {
const std::array<uint32_t, 1204> SHADERLOGISTICREGRESSION_COMP_SPV = {
0x07230203, 0x00010000, 0x0008000a, 0x000000ae,
0x00000000, 0x00020011, 0x00000001, 0x0006000b,
0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e,
0x00000000, 0x0003000e, 0x00000000, 0x00000001,
0x0006000f, 0x00000005, 0x00000004, 0x6e69616d,
0x00000000, 0x00000041, 0x00060010, 0x00000004,
0x00000011, 0x00000001, 0x00000001, 0x00000001,
0x00030003, 0x00000002, 0x000001c2, 0x00040005,
0x00000004, 0x6e69616d, 0x00000000, 0x00050005,
0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166,
0x00030005, 0x00000009, 0x0000007a, 0x00080005,
0x00000012, 0x65666e69, 0x636e6572, 0x66762865,
0x66763b32, 0x31663b32, 0x0000003b, 0x00030005,
0x0000000f, 0x00000078, 0x00030005, 0x00000010,
0x00000077, 0x00030005, 0x00000011, 0x00000062,
0x00080005, 0x00000017, 0x636c6163, 0x74616c75,
0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000,
0x00040005, 0x00000015, 0x74614879, 0x00000000,
0x00030005, 0x00000016, 0x00000079, 0x00030005,
0x00000021, 0x0000007a, 0x00040005, 0x00000027,
0x74614879, 0x00000000, 0x00040005, 0x00000028,
0x61726170, 0x0000006d, 0x00030005, 0x0000003e,
0x00786469, 0x00080005, 0x00000041, 0x475f6c67,
0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69,
0x00000044, 0x00040005, 0x00000046, 0x72754377,
0x00000072, 0x00040005, 0x00000048, 0x6e697762,
0x00000000, 0x00040006, 0x00000048, 0x00000000,
0x006e6977, 0x00030005, 0x0000004a, 0x00000000,
0x00040005, 0x00000054, 0x72754362, 0x00000072,
0x00040005, 0x00000056, 0x6e696262, 0x00000000,
0x00040006, 0x00000056, 0x00000000, 0x006e6962,
0x00030005, 0x00000058, 0x00000000, 0x00040005,
0x0000005b, 0x72754378, 0x00000072, 0x00030005,
0x0000005d, 0x00697862, 0x00040006, 0x0000005d,
0x00000000, 0x00006978, 0x00030005, 0x0000005f,
0x00000000, 0x00030005, 0x00000064, 0x006a7862,
0x00040006, 0x00000064, 0x00000000, 0x00006a78,
0x00030005, 0x00000066, 0x00000000, 0x00040005,
0x0000006b, 0x72754379, 0x00000072, 0x00030005,
0x0000006d, 0x00007962, 0x00040006, 0x0000006d,
0x00000000, 0x00000079, 0x00030005, 0x0000006f,
0x00000000, 0x00040005, 0x00000073, 0x74614879,
0x00000000, 0x00040005, 0x00000074, 0x61726170,
0x0000006d, 0x00040005, 0x00000076, 0x61726170,
0x0000006d, 0x00040005, 0x00000078, 0x61726170,
0x0000006d, 0x00030005, 0x0000007b, 0x00005a64,
0x00030005, 0x0000007f, 0x00005764, 0x00030005,
0x00000080, 0x0000006d, 0x00030005, 0x00000086,
0x00004264, 0x00040005, 0x0000008b, 0x756f7762,
0x00006974, 0x00050006, 0x0000008b, 0x00000000,
0x74756f77, 0x00000069, 0x00030005, 0x0000008d,
0x00000000, 0x00040005, 0x00000093, 0x756f7762,
0x00006a74, 0x00050006, 0x00000093, 0x00000000,
0x74756f77, 0x0000006a, 0x00030005, 0x00000095,
0x00000000, 0x00040005, 0x0000009c, 0x756f6262,
0x00000074, 0x00050006, 0x0000009c, 0x00000000,
0x74756f62, 0x00000000, 0x00030005, 0x0000009e,
0x00000000, 0x00040005, 0x000000a3, 0x756f6c62,
0x00000074, 0x00050006, 0x000000a3, 0x00000000,
0x74756f6c, 0x00000000, 0x00030005, 0x000000a5,
0x00000000, 0x00040005, 0x000000a7, 0x61726170,
0x0000006d, 0x00040005, 0x000000a9, 0x61726170,
0x0000006d, 0x00040047, 0x00000041, 0x0000000b,
0x0000001c, 0x00040047, 0x00000047, 0x00000006,
0x00000004, 0x00050048, 0x00000048, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000048,
0x00000003, 0x00040047, 0x0000004a, 0x00000022,
0x00000000, 0x00040047, 0x0000004a, 0x00000021,
0x00000003, 0x00040047, 0x00000055, 0x00000006,
0x00000004, 0x00050048, 0x00000056, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000056,
0x00000003, 0x00040047, 0x00000058, 0x00000022,
0x00000000, 0x00040047, 0x00000058, 0x00000021,
0x00000006, 0x00040047, 0x0000005c, 0x00000006,
0x00000004, 0x00050048, 0x0000005d, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000005d,
0x00000003, 0x00040047, 0x0000005f, 0x00000022,
0x00000000, 0x00040047, 0x0000005f, 0x00000021,
0x00000000, 0x00040047, 0x00000063, 0x00000006,
0x00000004, 0x00050048, 0x00000064, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000064,
0x00000003, 0x00040047, 0x00000066, 0x00000022,
0x00000000, 0x00040047, 0x00000066, 0x00000021,
0x00000001, 0x00040047, 0x0000006c, 0x00000006,
0x00000004, 0x00050048, 0x0000006d, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000006d,
0x00000003, 0x00040047, 0x0000006f, 0x00000022,
0x00000000, 0x00040047, 0x0000006f, 0x00000021,
0x00000002, 0x00040047, 0x00000080, 0x00000001,
0x00000000, 0x00040047, 0x0000008a, 0x00000006,
0x00000004, 0x00050048, 0x0000008b, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000008b,
0x00000003, 0x00040047, 0x0000008d, 0x00000022,
0x00000000, 0x00040047, 0x0000008d, 0x00000021,
0x00000004, 0x00040047, 0x00000092, 0x00000006,
0x00000004, 0x00050048, 0x00000093, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x00000093,
0x00000003, 0x00040047, 0x00000095, 0x00000022,
0x00000000, 0x00040047, 0x00000095, 0x00000021,
0x00000005, 0x00040047, 0x0000009b, 0x00000006,
0x00000004, 0x00050048, 0x0000009c, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x0000009c,
0x00000003, 0x00040047, 0x0000009e, 0x00000022,
0x00000000, 0x00040047, 0x0000009e, 0x00000021,
0x00000007, 0x00040047, 0x000000a2, 0x00000006,
0x00000004, 0x00050048, 0x000000a3, 0x00000000,
0x00000023, 0x00000000, 0x00030047, 0x000000a3,
0x00000003, 0x00040047, 0x000000a5, 0x00000022,
0x00000000, 0x00040047, 0x000000a5, 0x00000021,
0x00000008, 0x00040047, 0x000000ad, 0x0000000b,
0x00000019, 0x00020013, 0x00000002, 0x00030021,
0x00000003, 0x00000002, 0x00030016, 0x00000006,
0x00000020, 0x00040020, 0x00000007, 0x00000007,
0x00000006, 0x00040021, 0x00000008, 0x00000006,
0x00000007, 0x00040017, 0x0000000c, 0x00000006,
0x00000002, 0x00040020, 0x0000000d, 0x00000007,
0x0000000c, 0x00060021, 0x0000000e, 0x00000006,
0x0000000d, 0x0000000d, 0x00000007, 0x00050021,
0x00000014, 0x00000006, 0x00000007, 0x00000007,
0x0004002b, 0x00000006, 0x00000019, 0x3f800000,
0x00040015, 0x0000003c, 0x00000020, 0x00000000,
0x00040020, 0x0000003d, 0x00000007, 0x0000003c,
0x00040017, 0x0000003f, 0x0000003c, 0x00000003,
0x00040020, 0x00000040, 0x00000001, 0x0000003f,
0x0004003b, 0x00000040, 0x00000041, 0x00000001,
0x0004002b, 0x0000003c, 0x00000042, 0x00000000,
0x00040020, 0x00000043, 0x00000001, 0x0000003c,
0x0003001d, 0x00000047, 0x00000006, 0x0003001e,
0x00000048, 0x00000047, 0x00040020, 0x00000049,
0x00000002, 0x00000048, 0x0004003b, 0x00000049,
0x0000004a, 0x00000002, 0x00040015, 0x0000004b,
0x00000020, 0x00000001, 0x0004002b, 0x0000004b,
0x0000004c, 0x00000000, 0x00040020, 0x0000004d,
0x00000002, 0x00000006, 0x0004002b, 0x0000004b,
0x00000050, 0x00000001, 0x0003001d, 0x00000055,
0x00000006, 0x0003001e, 0x00000056, 0x00000055,
0x00040020, 0x00000057, 0x00000002, 0x00000056,
0x0004003b, 0x00000057, 0x00000058, 0x00000002,
0x0003001d, 0x0000005c, 0x00000006, 0x0003001e,
0x0000005d, 0x0000005c, 0x00040020, 0x0000005e,
0x00000002, 0x0000005d, 0x0004003b, 0x0000005e,
0x0000005f, 0x00000002, 0x0003001d, 0x00000063,
0x00000006, 0x0003001e, 0x00000064, 0x00000063,
0x00040020, 0x00000065, 0x00000002, 0x00000064,
0x0004003b, 0x00000065, 0x00000066, 0x00000002,
0x0003001d, 0x0000006c, 0x00000006, 0x0003001e,
0x0000006d, 0x0000006c, 0x00040020, 0x0000006e,
0x00000002, 0x0000006d, 0x0004003b, 0x0000006e,
0x0000006f, 0x00000002, 0x00040032, 0x00000006,
0x00000080, 0x00000000, 0x0003001d, 0x0000008a,
0x00000006, 0x0003001e, 0x0000008b, 0x0000008a,
0x00040020, 0x0000008c, 0x00000002, 0x0000008b,
0x0004003b, 0x0000008c, 0x0000008d, 0x00000002,
0x0003001d, 0x00000092, 0x00000006, 0x0003001e,
0x00000093, 0x00000092, 0x00040020, 0x00000094,
0x00000002, 0x00000093, 0x0004003b, 0x00000094,
0x00000095, 0x00000002, 0x0004002b, 0x0000003c,
0x00000097, 0x00000001, 0x0003001d, 0x0000009b,
0x00000006, 0x0003001e, 0x0000009c, 0x0000009b,
0x00040020, 0x0000009d, 0x00000002, 0x0000009c,
0x0004003b, 0x0000009d, 0x0000009e, 0x00000002,
0x0003001d, 0x000000a2, 0x00000006, 0x0003001e,
0x000000a3, 0x000000a2, 0x00040020, 0x000000a4,
0x00000002, 0x000000a3, 0x0004003b, 0x000000a4,
0x000000a5, 0x00000002, 0x0006002c, 0x0000003f,
0x000000ad, 0x00000097, 0x00000097, 0x00000097,
0x00050036, 0x00000002, 0x00000004, 0x00000000,
0x00000003, 0x000200f8, 0x00000005, 0x0004003b,
0x0000003d, 0x0000003e, 0x00000007, 0x0004003b,
0x0000000d, 0x00000046, 0x00000007, 0x0004003b,
0x00000007, 0x00000054, 0x00000007, 0x0004003b,
0x0000000d, 0x0000005b, 0x00000007, 0x0004003b,
0x00000007, 0x0000006b, 0x00000007, 0x0004003b,
0x00000007, 0x00000073, 0x00000007, 0x0004003b,
0x0000000d, 0x00000074, 0x00000007, 0x0004003b,
0x0000000d, 0x00000076, 0x00000007, 0x0004003b,
0x00000007, 0x00000078, 0x00000007, 0x0004003b,
0x00000007, 0x0000007b, 0x00000007, 0x0004003b,
0x0000000d, 0x0000007f, 0x00000007, 0x0004003b,
0x00000007, 0x00000086, 0x00000007, 0x0004003b,
0x00000007, 0x000000a7, 0x00000007, 0x0004003b,
0x00000007, 0x000000a9, 0x00000007, 0x00050041,
0x00000043, 0x00000044, 0x00000041, 0x00000042,
0x0004003d, 0x0000003c, 0x00000045, 0x00000044,
0x0003003e, 0x0000003e, 0x00000045, 0x00060041,
0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c,
0x0000004c, 0x0004003d, 0x00000006, 0x0000004f,
0x0000004e, 0x00060041, 0x0000004d, 0x00000051,
0x0000004a, 0x0000004c, 0x00000050, 0x0004003d,
0x00000006, 0x00000052, 0x00000051, 0x00050050,
0x0000000c, 0x00000053, 0x0000004f, 0x00000052,
0x0003003e, 0x00000046, 0x00000053, 0x00060041,
0x0000004d, 0x00000059, 0x00000058, 0x0000004c,
0x0000004c, 0x0004003d, 0x00000006, 0x0000005a,
0x00000059, 0x0003003e, 0x00000054, 0x0000005a,
0x0004003d, 0x0000003c, 0x00000060, 0x0000003e,
0x00060041, 0x0000004d, 0x00000061, 0x0000005f,
0x0000004c, 0x00000060, 0x0004003d, 0x00000006,
0x00000062, 0x00000061, 0x0004003d, 0x0000003c,
0x00000067, 0x0000003e, 0x00060041, 0x0000004d,
0x00000068, 0x00000066, 0x0000004c, 0x00000067,
0x0004003d, 0x00000006, 0x00000069, 0x00000068,
0x00050050, 0x0000000c, 0x0000006a, 0x00000062,
0x00000069, 0x0003003e, 0x0000005b, 0x0000006a,
0x0004003d, 0x0000003c, 0x00000070, 0x0000003e,
0x00060041, 0x0000004d, 0x00000071, 0x0000006f,
0x0000004c, 0x00000070, 0x0004003d, 0x00000006,
0x00000072, 0x00000071, 0x0003003e, 0x0000006b,
0x00000072, 0x0004003d, 0x0000000c, 0x00000075,
0x0000005b, 0x0003003e, 0x00000074, 0x00000075,
0x0004003d, 0x0000000c, 0x00000077, 0x00000046,
0x0003003e, 0x00000076, 0x00000077, 0x0004003d,
0x00000006, 0x00000079, 0x00000054, 0x0003003e,
0x00000078, 0x00000079, 0x00070039, 0x00000006,
0x0000007a, 0x00000012, 0x00000074, 0x00000076,
0x00000078, 0x0003003e, 0x00000073, 0x0000007a,
0x0004003d, 0x00000006, 0x0000007c, 0x00000073,
0x0004003d, 0x00000006, 0x0000007d, 0x0000006b,
0x00050083, 0x00000006, 0x0000007e, 0x0000007c,
0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e,
0x00050088, 0x00000006, 0x00000081, 0x00000019,
0x00000080, 0x0004003d, 0x0000000c, 0x00000082,
0x0000005b, 0x0005008e, 0x0000000c, 0x00000083,
0x00000082, 0x00000081, 0x0004003d, 0x00000006,
0x00000084, 0x0000007b, 0x0005008e, 0x0000000c,
0x00000085, 0x00000083, 0x00000084, 0x0003003e,
0x0000007f, 0x00000085, 0x00050088, 0x00000006,
0x00000087, 0x00000019, 0x00000080, 0x0004003d,
0x00000006, 0x00000088, 0x0000007b, 0x00050085,
0x00000006, 0x00000089, 0x00000087, 0x00000088,
0x0003003e, 0x00000086, 0x00000089, 0x0004003d,
0x0000003c, 0x0000008e, 0x0000003e, 0x00050041,
0x00000007, 0x0000008f, 0x0000007f, 0x00000042,
0x0004003d, 0x00000006, 0x00000090, 0x0000008f,
0x00060041, 0x0000004d, 0x00000091, 0x0000008d,
0x0000004c, 0x0000008e, 0x0003003e, 0x00000091,
0x00000090, 0x0004003d, 0x0000003c, 0x00000096,
0x0000003e, 0x00050041, 0x00000007, 0x00000098,
0x0000007f, 0x00000097, 0x0004003d, 0x00000006,
0x00000099, 0x00000098, 0x00060041, 0x0000004d,
0x0000009a, 0x00000095, 0x0000004c, 0x00000096,
0x0003003e, 0x0000009a, 0x00000099, 0x0004003d,
0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d,
0x00000006, 0x000000a0, 0x00000086, 0x00060041,
0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c,
0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0,
0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e,
0x0004003d, 0x00000006, 0x000000a8, 0x00000073,
0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d,
0x00000006, 0x000000aa, 0x0000006b, 0x0003003e,
0x000000a9, 0x000000aa, 0x00060039, 0x00000006,
0x000000ab, 0x00000017, 0x000000a7, 0x000000a9,
0x00060041, 0x0000004d, 0x000000ac, 0x000000a5,
0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac,
0x000000ab, 0x000100fd, 0x00010038, 0x00050036,
0x00000006, 0x0000000a, 0x00000000, 0x00000008,
0x00030037, 0x00000007, 0x00000009, 0x000200f8,
0x0000000b, 0x0004003d, 0x00000006, 0x0000001a,
0x00000009, 0x0004007f, 0x00000006, 0x0000001b,
0x0000001a, 0x0006000c, 0x00000006, 0x0000001c,
0x00000001, 0x0000001b, 0x0000001b, 0x00050081,
0x00000006, 0x0000001d, 0x00000019, 0x0000001c,
0x00050088, 0x00000006, 0x0000001e, 0x00000019,
0x0000001d, 0x000200fe, 0x0000001e, 0x00010038,
0x00050036, 0x00000006, 0x00000012, 0x00000000,
0x0000000e, 0x00030037, 0x0000000d, 0x0000000f,
0x00030037, 0x0000000d, 0x00000010, 0x00030037,
0x00000007, 0x00000011, 0x000200f8, 0x00000013,
0x0004003b, 0x00000007, 0x00000021, 0x00000007,
0x0004003b, 0x00000007, 0x00000027, 0x00000007,
0x0004003b, 0x00000007, 0x00000028, 0x00000007,
0x0004003d, 0x0000000c, 0x00000022, 0x00000010,
0x0004003d, 0x0000000c, 0x00000023, 0x0000000f,
0x00050094, 0x00000006, 0x00000024, 0x00000022,
0x00000023, 0x0004003d, 0x00000006, 0x00000025,
0x00000011, 0x00050081, 0x00000006, 0x00000026,
0x00000024, 0x00000025, 0x0003003e, 0x00000021,
0x00000026, 0x0004003d, 0x00000006, 0x00000029,
0x00000021, 0x0003003e, 0x00000028, 0x00000029,
0x00050039, 0x00000006, 0x0000002a, 0x0000000a,
0x00000028, 0x0003003e, 0x00000027, 0x0000002a,
0x0004003d, 0x00000006, 0x0000002b, 0x00000027,
0x000200fe, 0x0000002b, 0x00010038, 0x00050036,
0x00000006, 0x00000017, 0x00000000, 0x00000014,
0x00030037, 0x00000007, 0x00000015, 0x00030037,
0x00000007, 0x00000016, 0x000200f8, 0x00000018,
0x0004003d, 0x00000006, 0x0000002e, 0x00000016,
0x0004003d, 0x00000006, 0x0000002f, 0x00000015,
0x0006000c, 0x00000006, 0x00000030, 0x00000001,
0x0000001c, 0x0000002f, 0x00050085, 0x00000006,
0x00000031, 0x0000002e, 0x00000030, 0x0004003d,
0x00000006, 0x00000032, 0x00000016, 0x00050083,
0x00000006, 0x00000033, 0x00000019, 0x00000032,
0x0004003d, 0x00000006, 0x00000034, 0x00000015,
0x00050083, 0x00000006, 0x00000035, 0x00000019,
0x00000034, 0x0006000c, 0x00000006, 0x00000036,
0x00000001, 0x0000001c, 0x00000035, 0x00050085,
0x00000006, 0x00000037, 0x00000033, 0x00000036,
0x00050081, 0x00000006, 0x00000038, 0x00000031,
0x00000037, 0x0004007f, 0x00000006, 0x00000039,
0x00000038, 0x000200fe, 0x00000039, 0x00010038 };
} // namespace kp

View file

@ -0,0 +1,28 @@
#version 450

// Element-wise multiply: valuesOutput[i] = valuesLhs[i] * valuesRhs[i],
// with one invocation per element.

layout(set = 0, binding = 0) buffer tensorLhs {
    float valuesLhs[ ];
};
layout(set = 0, binding = 1) buffer tensorRhs {
    float valuesRhs[ ];
};
layout(set = 0, binding = 2) buffer tensorOutput {
    float valuesOutput[ ];
};

// Tensor lengths as specialization constants.
// NOTE(review): currently unused — no bounds check is performed, so the
// dispatch size must exactly match the tensor length; confirm callers ensure this.
layout (constant_id = 0) const uint LEN_LHS = 0;
layout (constant_id = 1) const uint LEN_RHS = 0;
layout (constant_id = 2) const uint LEN_OUT = 0;

layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

void main()
{
    // Each invocation handles exactly one element.
    uint index = gl_GlobalInvocationID.x;
    valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
}

View file

@ -0,0 +1,101 @@
#pragma once
#include <array>
#include <cstdint>
namespace kp {
const std::array<uint32_t, 366> SHADEROPMULT_COMP_SPV = {
0x07230203, 0x00010000, 0x0008000a, 0x0000002e,
0x00000000, 0x00020011, 0x00000001, 0x0006000b,
0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e,
0x00000000, 0x0003000e, 0x00000000, 0x00000001,
0x0006000f, 0x00000005, 0x00000004, 0x6e69616d,
0x00000000, 0x0000000b, 0x00060010, 0x00000004,
0x00000011, 0x00000001, 0x00000001, 0x00000001,
0x00030003, 0x00000002, 0x000001c2, 0x00040005,
0x00000004, 0x6e69616d, 0x00000000, 0x00040005,
0x00000008, 0x65646e69, 0x00000078, 0x00080005,
0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c,
0x7461636f, 0x496e6f69, 0x00000044, 0x00060005,
0x00000012, 0x736e6574, 0x754f726f, 0x74757074,
0x00000000, 0x00070006, 0x00000012, 0x00000000,
0x756c6176, 0x754f7365, 0x74757074, 0x00000000,
0x00030005, 0x00000014, 0x00000000, 0x00050005,
0x00000019, 0x736e6574, 0x684c726f, 0x00000073,
0x00060006, 0x00000019, 0x00000000, 0x756c6176,
0x684c7365, 0x00000073, 0x00030005, 0x0000001b,
0x00000000, 0x00050005, 0x00000021, 0x736e6574,
0x6852726f, 0x00000073, 0x00060006, 0x00000021,
0x00000000, 0x756c6176, 0x68527365, 0x00000073,
0x00030005, 0x00000023, 0x00000000, 0x00040005,
0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005,
0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005,
0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047,
0x0000000b, 0x0000000b, 0x0000001c, 0x00040047,
0x00000011, 0x00000006, 0x00000004, 0x00050048,
0x00000012, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000012, 0x00000003, 0x00040047,
0x00000014, 0x00000022, 0x00000000, 0x00040047,
0x00000014, 0x00000021, 0x00000002, 0x00040047,
0x00000018, 0x00000006, 0x00000004, 0x00050048,
0x00000019, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000019, 0x00000003, 0x00040047,
0x0000001b, 0x00000022, 0x00000000, 0x00040047,
0x0000001b, 0x00000021, 0x00000000, 0x00040047,
0x00000020, 0x00000006, 0x00000004, 0x00050048,
0x00000021, 0x00000000, 0x00000023, 0x00000000,
0x00030047, 0x00000021, 0x00000003, 0x00040047,
0x00000023, 0x00000022, 0x00000000, 0x00040047,
0x00000023, 0x00000021, 0x00000001, 0x00040047,
0x00000029, 0x00000001, 0x00000000, 0x00040047,
0x0000002a, 0x00000001, 0x00000001, 0x00040047,
0x0000002b, 0x00000001, 0x00000002, 0x00040047,
0x0000002d, 0x0000000b, 0x00000019, 0x00020013,
0x00000002, 0x00030021, 0x00000003, 0x00000002,
0x00040015, 0x00000006, 0x00000020, 0x00000000,
0x00040020, 0x00000007, 0x00000007, 0x00000006,
0x00040017, 0x00000009, 0x00000006, 0x00000003,
0x00040020, 0x0000000a, 0x00000001, 0x00000009,
0x0004003b, 0x0000000a, 0x0000000b, 0x00000001,
0x0004002b, 0x00000006, 0x0000000c, 0x00000000,
0x00040020, 0x0000000d, 0x00000001, 0x00000006,
0x00030016, 0x00000010, 0x00000020, 0x0003001d,
0x00000011, 0x00000010, 0x0003001e, 0x00000012,
0x00000011, 0x00040020, 0x00000013, 0x00000002,
0x00000012, 0x0004003b, 0x00000013, 0x00000014,
0x00000002, 0x00040015, 0x00000015, 0x00000020,
0x00000001, 0x0004002b, 0x00000015, 0x00000016,
0x00000000, 0x0003001d, 0x00000018, 0x00000010,
0x0003001e, 0x00000019, 0x00000018, 0x00040020,
0x0000001a, 0x00000002, 0x00000019, 0x0004003b,
0x0000001a, 0x0000001b, 0x00000002, 0x00040020,
0x0000001d, 0x00000002, 0x00000010, 0x0003001d,
0x00000020, 0x00000010, 0x0003001e, 0x00000021,
0x00000020, 0x00040020, 0x00000022, 0x00000002,
0x00000021, 0x0004003b, 0x00000022, 0x00000023,
0x00000002, 0x00040032, 0x00000006, 0x00000029,
0x00000000, 0x00040032, 0x00000006, 0x0000002a,
0x00000000, 0x00040032, 0x00000006, 0x0000002b,
0x00000000, 0x0004002b, 0x00000006, 0x0000002c,
0x00000001, 0x0006002c, 0x00000009, 0x0000002d,
0x0000002c, 0x0000002c, 0x0000002c, 0x00050036,
0x00000002, 0x00000004, 0x00000000, 0x00000003,
0x000200f8, 0x00000005, 0x0004003b, 0x00000007,
0x00000008, 0x00000007, 0x00050041, 0x0000000d,
0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d,
0x00000006, 0x0000000f, 0x0000000e, 0x0003003e,
0x00000008, 0x0000000f, 0x0004003d, 0x00000006,
0x00000017, 0x00000008, 0x0004003d, 0x00000006,
0x0000001c, 0x00000008, 0x00060041, 0x0000001d,
0x0000001e, 0x0000001b, 0x00000016, 0x0000001c,
0x0004003d, 0x00000010, 0x0000001f, 0x0000001e,
0x0004003d, 0x00000006, 0x00000024, 0x00000008,
0x00060041, 0x0000001d, 0x00000025, 0x00000023,
0x00000016, 0x00000024, 0x0004003d, 0x00000010,
0x00000026, 0x00000025, 0x00050085, 0x00000010,
0x00000027, 0x0000001f, 0x00000026, 0x00060041,
0x0000001d, 0x00000028, 0x00000014, 0x00000016,
0x00000017, 0x0003003e, 0x00000028, 0x00000027,
0x000100fd, 0x00010038 };
} // namespace kp

View file

@ -0,0 +1,29 @@
// Copyright 2020 Google LLC
RWStructuredBuffer<uint> values : register(u0);
[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 32;
// Iteratively computes the n-th Fibonacci number (fib(0) = 0, fib(1) = 1).
uint fibonacci(uint n) {
    // fib(0) and fib(1) are the arguments themselves.
    if (n <= 1) {
        return n;
    }
    // Walk the sequence upward: a = fib(i-1), b = fib(i), starting at i = 2.
    uint a = 1;
    uint b = 1;
    for (uint i = 2; i < n; ++i) {
        uint next = a + b;
        a = b;
        b = next;
    }
    return b;
}
// One thread per buffer element; replaces each value in-place with
// fibonacci(value).
[numthreads(1, 1, 1)]
void main(uint3 GlobalInvocationID : SV_DispatchThreadID)
{
    uint index = GlobalInvocationID.x;
    // Guard against dispatches larger than the buffer.
    if (index >= BUFFER_ELEMENTS)
        return;
    values[index] = fibonacci(values[index]);
}