10 changes: 7 additions & 3 deletions include/infinicore/graph/graph.hpp
@@ -30,17 +30,21 @@ class GraphOperator {

class Graph {
public:
Graph() = default;
~Graph() = default;
Graph();
~Graph();

void run() const;

protected:
void add_operator(std::shared_ptr<GraphOperator> op);

void instantiate();
std::vector<std::shared_ptr<GraphOperator>> op_list_;

friend class GraphManager;

private:
struct DeviceGraph;
std::unique_ptr<DeviceGraph> device_graph_;
};
} // namespace infinicore::graph

17 changes: 16 additions & 1 deletion include/infinicore/tensor.hpp
@@ -133,7 +133,18 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {

void debug() const;

Tensor to_blob() const;
/**
* Unsafe API that returns a new tensor sharing the same raw memory, left untracked by the allocator.
* Used to loosely track a piece of memory while still allowing it to be reused,
* typically in a compute-graph scenario.
*/
Tensor to_blob_() const;

/**
* Unsafe API that returns a new tensor sharing the same memory and lets the allocator track that memory again.
* Should only be used on a tensor returned by to_blob_().
*/
Tensor resume_from_blob_() const;

///
/// Data Transfer APIs
@@ -299,6 +310,10 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
protected:
TensorMetaData meta_;
TensorData data_;

private:
// Marks whether this tensor was created by to_blob_()
bool to_blob_mark_ = false;
};

} // namespace infinicore
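Note: a minimal usage sketch of the new blob APIs below; Tensor::empty() mirrors its use in copy.cc, while the dtype/device spellings are assumptions for illustration.

// Sketch: pair to_blob_() with resume_from_blob_() around graph recording.
// DataType::F32 and the device value are assumed spellings, not part of this diff.
Tensor t = Tensor::empty(shape, DataType::F32, device);
Tensor blob = t->to_blob_(); // allocator stops tracking the block; memory may be reused
// ... record a compute graph that reads/writes through `blob` ...
Tensor tracked = blob->resume_from_blob_(); // allocator tracks the same block again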
26 changes: 26 additions & 0 deletions include/infiniop/ops/swiglu_cuda.h
@@ -0,0 +1,26 @@
#ifndef __INFINIOP_SWIGLU_CUDA_API_H__
#define __INFINIOP_SWIGLU_CUDA_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSwiGLUCudaDescriptor_t;

__C __export infiniStatus_t infiniopCreateSwiGLUCudaDescriptor(infiniopHandle_t handle,
infiniopSwiGLUCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);

__C __export infiniStatus_t infiniopGetSwiGLUCudaWorkspaceSize(infiniopSwiGLUCudaDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSwiGLUCuda(infiniopSwiGLUCudaDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream);

__C __export infiniStatus_t infiniopDestroySwiGLUCudaDescriptor(infiniopSwiGLUCudaDescriptor_t desc);

#endif
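Note: a sketch of the expected call sequence for this header; `handle`, the tensor descriptors, and the device buffers are assumed to come from the surrounding infiniop API.

// Sketch: full lifecycle of the SwiGLU CUDA descriptor declared above.
infiniopSwiGLUCudaDescriptor_t desc;
infiniopCreateSwiGLUCudaDescriptor(handle, &desc, c_desc, a_desc, b_desc);

size_t workspace_size = 0;
infiniopGetSwiGLUCudaWorkspaceSize(desc, &workspace_size);
// ... allocate `workspace` of at least workspace_size bytes on the device ...

infiniopSwiGLUCuda(desc, workspace, workspace_size, c, a, b, stream); // c = swiglu(a, b)
infiniopDestroySwiGLUCudaDescriptor(desc);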
23 changes: 23 additions & 0 deletions include/infinirt.h
@@ -6,6 +6,9 @@

typedef void *infinirtStream_t;
typedef void *infinirtEvent_t;
typedef void *infinirtGraph_t;
typedef void *infinirtGraphNode_t;
typedef void *infinirtGraphExec_t;

__C __export infiniStatus_t infinirtInit();

@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);

// Graph
typedef enum {
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
} infinirtStreamCaptureMode_t;

__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
__C __export infiniStatus_t infinirtGraphInstantiate(
infinirtGraphExec_t *graph_exec_ptr,
infinirtGraph_t graph,
infinirtGraphNode_t *node_ptr,
char *log_buffer,
size_t buffer_size);
__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
__C __export infiniStatus_t infinirtGraphLaunch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);

#endif // __INFINIRT_API_H__
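Note: a sketch of the capture/replay flow these declarations enable; status checks elided for brevity.

// Sketch: capture work submitted to a stream, instantiate it once, replay it cheaply.
infinirtGraph_t graph = nullptr;
infinirtGraphExec_t exec = nullptr;
infinirtGraphNode_t err_node = nullptr;
char log_buf[4096];

infinirtStreamBeginCapture(stream, INFINIRT_STREAM_CAPTURE_MODE_GLOBAL);
// ... enqueue kernels / async memcpys on `stream` ...
infinirtStreamEndCapture(stream, &graph);
infinirtGraphInstantiate(&exec, graph, &err_node, log_buf, sizeof(log_buf));

infinirtGraphLaunch(exec, stream); // replay as many times as needed
infinirtGraphExecDestroy(exec);
infinirtGraphDestroy(graph);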
26 changes: 23 additions & 3 deletions src/infinicore/context/allocators/pinnable_block_allocator.cc
@@ -52,9 +52,19 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
if (size <= cls.block_size) {
if (!cls.free_blocks.empty()) {
block = cls.free_blocks.back();
cls.free_blocks.pop_back();
block->in_use = true;
return reinterpret_cast<std::byte *>(block->ptr);
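// Skip free-list entries whose block was re-pinned via mark_in_use_() (e.g. a blob re-instantiated for a graph)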
while (block != nullptr && block->in_use) {
cls.free_blocks.pop_back();
if (cls.free_blocks.empty()) {
block = nullptr;
break;
}
block = cls.free_blocks.back();
}
if (block != nullptr) {
cls.free_blocks.pop_back();
block->in_use = true;
return reinterpret_cast<std::byte *>(block->ptr);
}
}
// Allocate a new block for this class
block = std::make_shared<Block>();
@@ -125,6 +135,16 @@ void PinnableBlockAllocator::deallocate(std::byte *ptr) {
}
}

size_t PinnableBlockAllocator::mark_in_use_(void *ptr, bool in_use) {
// Take the lock before touching all_blocks_ so the lookup cannot race with allocate()/deallocate()
std::lock_guard<std::mutex> lock(mutex_);
auto it = all_blocks_.find(ptr); // ptr is already void *, no cast needed
if (it == all_blocks_.end()) {
throw std::runtime_error("Pointer not allocated by this allocator");
}
it->second->in_use = in_use;
return it->second->size;
}

// ------------------- trim -------------------
void PinnableBlockAllocator::trim() {
std::lock_guard<std::mutex> lock(mutex_);
src/infinicore/context/allocators/pinnable_block_allocator.hpp
@@ -32,6 +32,10 @@ class PinnableBlockAllocator : public MemoryAllocator {
// Switch pinned/graph mode
void set_pin_mode(bool pinned) { pinned_mode_ = pinned; }

// Internal use only: force-sets the in_use flag for a memory block.
// Returns the size of the block.
size_t mark_in_use_(void *ptr, bool in_use);

// trim cached blocks back to GPU (not pinned)
void trim();

7 changes: 7 additions & 0 deletions src/infinicore/context/context_impl.cc
@@ -1,4 +1,5 @@
#include "context_impl.hpp"
#include "internal.hpp"

#include "../utils.hpp"

@@ -194,6 +195,12 @@ void addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
std::shared_ptr<graph::Graph> stopGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording();
}

std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob) {
setDevice(blob->device());
return ContextImpl::singleton().getCurrentRuntime()->reinstantiateBlob(blob);
}

} // namespace context

} // namespace infinicore
10 changes: 10 additions & 0 deletions src/infinicore/context/internal.hpp
@@ -0,0 +1,10 @@
#pragma once

#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"

#include "infinicore/graph/graph.hpp"

namespace infinicore::context {
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
} // namespace infinicore::context
9 changes: 9 additions & 0 deletions src/infinicore/context/runtime/runtime.cc
@@ -77,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true);
}

std::shared_ptr<Memory> Runtime::reinstantiateBlob(std::shared_ptr<Memory> blob) {
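// Re-pin the blob's block in the allocator; the Memory returned below deallocates it on release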
device_memory_allocator_->mark_in_use_(blob->data(), true);
return std::make_shared<Memory>(
blob->data(), blob->size(), device_,
[alloc = device_memory_allocator_.get()](std::byte *p) {
alloc->deallocate(p);
});
}

void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) {
if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_));
1 change: 1 addition & 0 deletions src/infinicore/context/runtime/runtime.hpp
@@ -37,6 +37,7 @@ class Runtime {

std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);

void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
94 changes: 89 additions & 5 deletions src/infinicore/graph/graph.cc
@@ -1,14 +1,16 @@
#include "graph_manager.hpp"

#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>

namespace infinicore::graph {

/* =========================
* GraphTensor
* ========================= */

GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob()) {
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob_()) {
}

/* =========================
@@ -29,16 +31,91 @@ GraphOperator::~GraphOperator() {
* Graph
* ========================= */

struct Graph::DeviceGraph {
infinirtGraph_t graph = nullptr;
infinirtGraphExec_t exec = nullptr;
infinirtGraphNode_t node = nullptr;
std::vector<char> log_buffer;

DeviceGraph() {
log_buffer.resize(4 * 1024);
}

~DeviceGraph() {
if (exec) {
infinirtGraphExecDestroy(exec);
}
if (graph) {
infinirtGraphDestroy(graph);
}
}

void launch() {
INFINICORE_CHECK_ERROR(infinirtGraphLaunch(exec, context::getStream()));
}
};

Graph::Graph() {
}

void Graph::run() const {
for (auto &op : op_list_) {
op->run();
if (device_graph_ != nullptr && device_graph_->exec != nullptr) {
device_graph_->launch();
} else {
for (auto &op : op_list_) {
op->run();
}
}
}

void Graph::add_operator(std::shared_ptr<GraphOperator> op) {
op_list_.push_back(op);
}

void Graph::instantiate() {
// Reset device graph
device_graph_ = std::make_unique<DeviceGraph>();

// Warm up: run a few iterations eagerly so lazy allocation and kernel setup happen before capture
for (size_t iter = 0; iter < 5; ++iter) {
this->run();
}
infinicore::context::syncStream();

if (infinirtStreamBeginCapture(
context::getStream(),
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL)
!= INFINI_STATUS_SUCCESS) {
return;
}

// Run and record
this->run();

if (infinirtStreamEndCapture(
context::getStream(),
&device_graph_->graph)
!= INFINI_STATUS_SUCCESS) {
return;
}

if (infinirtGraphInstantiate(
&device_graph_->exec,
device_graph_->graph,
&device_graph_->node,
device_graph_->log_buffer.data(),
device_graph_->log_buffer.size())
!= INFINI_STATUS_SUCCESS) {
static bool warned_once = false;
if (!warned_once) {
warned_once = true;
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
}
}
}

Graph::~Graph() = default;

/* =========================
* GraphManager
* ========================= */
@@ -48,19 +125,26 @@ bool GraphManager::is_recording() const {
}

void GraphManager::start_recording() {
if (is_recording()) {
spdlog::warn("Graph is already recording. Previous recording will be dropped.");
}
recording_ = true;
graph_ = std::make_shared<Graph>();
}

void GraphManager::add_operator(std::shared_ptr<GraphOperator> op) {
INFINICORE_ASSERT(recording_);
INFINICORE_ASSERT(is_recording());

graph_->add_operator(op);
}

std::shared_ptr<Graph> GraphManager::stop_recording() {

if (!is_recording()) {
spdlog::warn("Graph is not recording. Please start recording first.");
return nullptr;
}
recording_ = false;
graph_->instantiate();
return std::exchange(graph_, nullptr);
}
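
Note: the intended record/replay usage from the caller's side, assuming a startGraphRecording() counterpart to the stopGraphRecording() shown in context_impl.cc; that name and the model code are illustrative.

// Sketch: record once, then replay the instantiated device graph each step.
context::startGraphRecording(); // assumed counterpart to stopGraphRecording()
model.forward(input); // operators are collected via addGraphOperator()
std::shared_ptr<graph::Graph> g = context::stopGraphRecording(); // calls instantiate()

for (int step = 0; step < num_steps; ++step) {
g->run(); // launches the captured device graph, or falls back to per-op execution
}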

2 changes: 1 addition & 1 deletion src/infinicore/tensor/copy.cc
@@ -38,7 +38,7 @@ void TensorImpl::copy_from(Tensor src) {
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::setDevice(src->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
context::memcpyD2H(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src->device().getType() == Device::Type::CPU) {