10 changes: 7 additions & 3 deletions include/infinicore/graph/graph.hpp
@@ -30,17 +30,21 @@ class GraphOperator {

class Graph {
public:
Graph() = default;
~Graph() = default;
Graph();
~Graph();

void run() const;

protected:
void add_operator(std::shared_ptr<GraphOperator> op);

void instantiate();
std::vector<std::shared_ptr<GraphOperator>> op_list_;

friend class GraphManager;

private:
struct DeviceGraph;
std::unique_ptr<DeviceGraph> device_graph_;
};
} // namespace infinicore::graph

17 changes: 16 additions & 1 deletion include/infinicore/tensor.hpp
@@ -133,7 +133,18 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {

void debug() const;

Tensor to_blob() const;
/**
* Unsafe API that returns a new tensor sharing the same raw memory, left untracked by the allocator.
* Used to loosely track a piece of memory while still allowing it to be reused,
* typically in a compute-graph scenario.
*/
Tensor to_blob_() const;

/**
* Unsafe API that returns a new tensor sharing the same memory and lets the allocator track that memory again.
* Should only be used on a tensor returned by to_blob_().
*/
Tensor resume_from_blob_() const;

///
/// Data Transfer APIs
@@ -299,6 +310,10 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
protected:
TensorMetaData meta_;
TensorData data_;

private:
// Marks whether this tensor was created by to_blob_()
bool to_blob_mark_ = false;
};

} // namespace infinicore
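Note: a minimal usage sketch of the new blob APIs below; Tensor::empty() mirrors its use in copy.cc, while the dtype/device spellings are assumptions for illustration.

// Sketch: pair to_blob_() with resume_from_blob_() around graph recording.
// DataType::F32 and the device value are assumed spellings, not part of this diff.
Tensor t = Tensor::empty(shape, DataType::F32, device);
Tensor blob = t->to_blob_(); // allocator stops tracking the block; memory may be reused
// ... record a compute graph that reads/writes through `blob` ...
Tensor tracked = blob->resume_from_blob_(); // allocator tracks the same block again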
26 changes: 26 additions & 0 deletions include/infiniop/ops/swiglu_cuda.h
@@ -0,0 +1,26 @@
#ifndef __INFINIOP_SWIGLU_CUDA_API_H__
#define __INFINIOP_SWIGLU_CUDA_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSwiGLUCudaDescriptor_t;

__C __export infiniStatus_t infiniopCreateSwiGLUCudaDescriptor(infiniopHandle_t handle,
infiniopSwiGLUCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);

__C __export infiniStatus_t infiniopGetSwiGLUCudaWorkspaceSize(infiniopSwiGLUCudaDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSwiGLUCuda(infiniopSwiGLUCudaDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream);

__C __export infiniStatus_t infiniopDestroySwiGLUCudaDescriptor(infiniopSwiGLUCudaDescriptor_t desc);

#endif
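Note: a sketch of the expected call sequence for this header; `handle`, the tensor descriptors, and the device buffers are assumed to come from the surrounding infiniop API.

// Sketch: full lifecycle of the SwiGLU CUDA descriptor declared above.
infiniopSwiGLUCudaDescriptor_t desc;
infiniopCreateSwiGLUCudaDescriptor(handle, &desc, c_desc, a_desc, b_desc);

size_t workspace_size = 0;
infiniopGetSwiGLUCudaWorkspaceSize(desc, &workspace_size);
// ... allocate `workspace` of at least workspace_size bytes on the device ...

infiniopSwiGLUCuda(desc, workspace, workspace_size, c, a, b, stream); // c = swiglu(a, b)
infiniopDestroySwiGLUCudaDescriptor(desc);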
23 changes: 23 additions & 0 deletions include/infinirt.h
@@ -6,6 +6,9 @@

typedef void *infinirtStream_t;
typedef void *infinirtEvent_t;
typedef void *infinirtGraph_t;
typedef void *infinirtGraphNode_t;
typedef void *infinirtGraphExec_t;

__C __export infiniStatus_t infinirtInit();

@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);

// Graph
typedef enum {
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
} infinirtStreamCaptureMode_t;

__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
__C __export infiniStatus_t infinirtGraphInstantiate(
infinirtGraphExec_t *graph_exec_ptr,
infinirtGraph_t graph,
infinirtGraphNode_t *node_ptr,
char *log_buffer,
size_t buffer_size);
__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
__C __export infiniStatus_t infinirtGraphLaunch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);

#endif // __INFINIRT_API_H__
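Note: a sketch of the capture/replay flow these declarations enable; status checks elided for brevity.

// Sketch: capture work submitted to a stream, instantiate it once, replay it cheaply.
infinirtGraph_t graph = nullptr;
infinirtGraphExec_t exec = nullptr;
infinirtGraphNode_t err_node = nullptr;
char log_buf[4096];

infinirtStreamBeginCapture(stream, INFINIRT_STREAM_CAPTURE_MODE_GLOBAL);
// ... enqueue kernels / async memcpys on `stream` ...
infinirtStreamEndCapture(stream, &graph);
infinirtGraphInstantiate(&exec, graph, &err_node, log_buf, sizeof(log_buf));

infinirtGraphLaunch(exec, stream); // replay as many times as needed
infinirtGraphExecDestroy(exec);
infinirtGraphDestroy(graph);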
26 changes: 23 additions & 3 deletions src/infinicore/context/allocators/pinnable_block_allocator.cc
@@ -52,9 +52,19 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
if (size <= cls.block_size) {
if (!cls.free_blocks.empty()) {
block = cls.free_blocks.back();
cls.free_blocks.pop_back();
block->in_use = true;
return reinterpret_cast<std::byte *>(block->ptr);
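// Skip free-list entries whose block was re-pinned via mark_in_use_() (e.g. a blob re-instantiated for a graph)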
while (block != nullptr && block->in_use) {
cls.free_blocks.pop_back();
if (cls.free_blocks.empty()) {
block = nullptr;
break;
}
block = cls.free_blocks.back();
}
if (block != nullptr) {
cls.free_blocks.pop_back();
block->in_use = true;
return reinterpret_cast<std::byte *>(block->ptr);
}
}
// Allocate a new block for this class
block = std::make_shared<Block>();
@@ -125,6 +135,16 @@ void PinnableBlockAllocator::deallocate(std::byte *ptr) {
}
}

size_t PinnableBlockAllocator::mark_in_use_(void *ptr, bool in_use) {
// Take the lock before touching all_blocks_ so the lookup cannot race with allocate()/deallocate()
std::lock_guard<std::mutex> lock(mutex_);
auto it = all_blocks_.find(ptr); // ptr is already void *, no cast needed
if (it == all_blocks_.end()) {
throw std::runtime_error("Pointer not allocated by this allocator");
}
it->second->in_use = in_use;
return it->second->size;
}

// ------------------- trim -------------------
void PinnableBlockAllocator::trim() {
std::lock_guard<std::mutex> lock(mutex_);
src/infinicore/context/allocators/pinnable_block_allocator.hpp
@@ -32,6 +32,10 @@ class PinnableBlockAllocator : public MemoryAllocator {
// Switch pinned/graph mode
void set_pin_mode(bool pinned) { pinned_mode_ = pinned; }

// Internal use only: force-sets the in_use flag for a memory block.
// Returns the size of the block.
size_t mark_in_use_(void *ptr, bool in_use);

// trim cached blocks back to GPU (not pinned)
void trim();

7 changes: 7 additions & 0 deletions src/infinicore/context/context_impl.cc
@@ -1,4 +1,5 @@
#include "context_impl.hpp"
#include "internal.hpp"

#include "../utils.hpp"

@@ -194,6 +195,12 @@ void addGraphOperator(std::shared_ptr<graph::GraphOperator> op) {
std::shared_ptr<graph::Graph> stopGraphRecording() {
return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording();
}

std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob) {
setDevice(blob->device());
return ContextImpl::singleton().getCurrentRuntime()->reinstantiateBlob(blob);
}

} // namespace context

} // namespace infinicore
10 changes: 10 additions & 0 deletions src/infinicore/context/internal.hpp
@@ -0,0 +1,10 @@
#pragma once

#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"

#include "infinicore/graph/graph.hpp"

namespace infinicore::context {
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);
} // namespace infinicore::context
9 changes: 9 additions & 0 deletions src/infinicore/context/runtime/runtime.cc
@@ -77,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true);
}

std::shared_ptr<Memory> Runtime::reinstantiateBlob(std::shared_ptr<Memory> blob) {
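// Re-pin the blob's block in the allocator; the Memory returned below deallocates it on release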
device_memory_allocator_->mark_in_use_(blob->data(), true);
return std::make_shared<Memory>(
blob->data(), blob->size(), device_,
[alloc = device_memory_allocator_.get()](std::byte *p) {
alloc->deallocate(p);
});
}

void Runtime::memcpyH2D(void *dst, const void *src, size_t size, bool async) {
if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyAsync(dst, src, size, INFINIRT_MEMCPY_H2D, stream_));
1 change: 1 addition & 0 deletions src/infinicore/context/runtime/runtime.hpp
@@ -37,6 +37,7 @@ class Runtime {

std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
std::shared_ptr<Memory> reinstantiateBlob(std::shared_ptr<Memory> blob);

void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
94 changes: 89 additions & 5 deletions src/infinicore/graph/graph.cc
@@ -1,14 +1,16 @@
#include "graph_manager.hpp"

#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>

namespace infinicore::graph {

/* =========================
* GraphTensor
* ========================= */

GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob()) {
GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob_()) {
}

/* =========================
@@ -29,16 +31,91 @@ GraphOperator::~GraphOperator() {
* Graph
* ========================= */

struct Graph::DeviceGraph {
infinirtGraph_t graph = nullptr;
infinirtGraphExec_t exec = nullptr;
infinirtGraphNode_t node = nullptr;
std::vector<char> log_buffer;

DeviceGraph() {
log_buffer.resize(4 * 1024);
}

~DeviceGraph() {
if (exec) {
infinirtGraphExecDestroy(exec);
}
if (graph) {
infinirtGraphDestroy(graph);
}
}

void launch() {
INFINICORE_CHECK_ERROR(infinirtGraphLaunch(exec, context::getStream()));
}
};

Graph::Graph() {
}

void Graph::run() const {
for (auto &op : op_list_) {
op->run();
if (device_graph_ != nullptr && device_graph_->exec != nullptr) {
device_graph_->launch();
} else {
for (auto &op : op_list_) {
op->run();
}
}
}

void Graph::add_operator(std::shared_ptr<GraphOperator> op) {
op_list_.push_back(op);
}

void Graph::instantiate() {
// Reset device graph
device_graph_ = std::make_unique<DeviceGraph>();

// Warm up: run a few iterations eagerly so lazy allocation and kernel setup happen before capture
for (size_t iter = 0; iter < 5; ++iter) {
this->run();
}
infinicore::context::syncStream();

if (infinirtStreamBeginCapture(
context::getStream(),
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL)
!= INFINI_STATUS_SUCCESS) {
return;
}

// Run and record
this->run();

if (infinirtStreamEndCapture(
context::getStream(),
&device_graph_->graph)
!= INFINI_STATUS_SUCCESS) {
return;
}

if (infinirtGraphInstantiate(
&device_graph_->exec,
device_graph_->graph,
&device_graph_->node,
device_graph_->log_buffer.data(),
device_graph_->log_buffer.size())
!= INFINI_STATUS_SUCCESS) {
static bool warned_once = false;
if (!warned_once) {
warned_once = true;
spdlog::warn("Fail to instantiate device graph: {}", std::string(device_graph_.get()->log_buffer.data()));
}
}
}

Graph::~Graph() = default;

/* =========================
* GraphManager
* ========================= */
@@ -48,19 +125,26 @@ bool GraphManager::is_recording() const {
}

void GraphManager::start_recording() {
if (is_recording()) {
spdlog::warn("Graph is already recording. Previous recording will be dropped.");
}
recording_ = true;
graph_ = std::make_shared<Graph>();
}

void GraphManager::add_operator(std::shared_ptr<GraphOperator> op) {
INFINICORE_ASSERT(recording_);
INFINICORE_ASSERT(is_recording());

graph_->add_operator(op);
}

std::shared_ptr<Graph> GraphManager::stop_recording() {

if (!is_recording()) {
spdlog::warn("Graph is not recording. Please start recording first.");
return nullptr;
}
recording_ = false;
graph_->instantiate();
return std::exchange(graph_, nullptr);
}
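
Note: the intended record/replay usage from the caller's side, assuming a startGraphRecording() counterpart to the stopGraphRecording() shown in context_impl.cc; that name and the model code are illustrative.

// Sketch: record once, then replay the instantiated device graph each step.
context::startGraphRecording(); // assumed counterpart to stopGraphRecording()
model.forward(input); // operators are collected via addGraphOperator()
std::shared_ptr<graph::Graph> g = context::stopGraphRecording(); // calls instantiate()

for (int step = 0; step < num_steps; ++step) {
g->run(); // launches the captured device graph, or falls back to per-op execution
}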

2 changes: 1 addition & 1 deletion src/infinicore/tensor/copy.cc
@@ -38,7 +38,7 @@ void TensorImpl::copy_from(Tensor src) {
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::setDevice(src->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
context::memcpyD2H(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src->device().getType() == Device::Type::CPU) {