Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c3075ba
initial setup
LinoGiger Jan 27, 2026
e2a6604
improved logic of upload
LinoGiger Jan 27, 2026
2835de1
Merge branch 'main' into feat/RAPID-6800-batch-url-upload
LinoGiger Jan 27, 2026
02b01c9
added checking of parameters on upload config and removed order confi…
LinoGiger Jan 27, 2026
770c332
changed tqdm to be tqdm.auto
LinoGiger Jan 27, 2026
191b583
initial test with concurrent datapoint creation
LinoGiger Jan 28, 2026
ecf0433
added double progressbar
LinoGiger Jan 28, 2026
2d03906
improved speed on checking assets
LinoGiger Jan 28, 2026
7f72e01
moved update of progress bar
LinoGiger Jan 28, 2026
b984b82
Revert "moved update of progress bar"
LinoGiger Jan 28, 2026
042d4f5
refactored for better readability
LinoGiger Jan 28, 2026
4406464
moved ability to disable cache to disable cacheToDisk
LinoGiger Jan 28, 2026
58740de
create failed upload from exception instead of custom where possible
LinoGiger Jan 28, 2026
a97cbec
removed unused timeout
LinoGiger Jan 28, 2026
aea8981
adjusted input in orchestrator and added better doc strings for index…
LinoGiger Jan 28, 2026
67fb6ab
upped batchsize to 1000
LinoGiger Jan 28, 2026
c96f895
upped cacheTimeout
LinoGiger Jan 29, 2026
b5210b2
simultaneous batch creation and pulling
LinoGiger Jan 29, 2026
28fe637
fixed cached assets not being counted to completed
LinoGiger Jan 29, 2026
d56c354
removed access to private method
LinoGiger Jan 29, 2026
6bc756d
fixed log that would be too big
LinoGiger Jan 29, 2026
841b642
test batch interrupt
LinoGiger Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm
from tqdm.auto import tqdm

from rapidata.rapidata_client.config import logger
from rapidata.rapidata_client.config.rapidata_config import rapidata_config
Expand Down
14 changes: 0 additions & 14 deletions src/rapidata/rapidata_client/config/order_config.py

This file was deleted.

4 changes: 0 additions & 4 deletions src/rapidata/rapidata_client/config/rapidata_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pydantic import BaseModel, Field

from rapidata.rapidata_client.config.logging_config import LoggingConfig
from rapidata.rapidata_client.config.order_config import OrderConfig
from rapidata.rapidata_client.config.upload_config import UploadConfig


Expand All @@ -15,8 +14,6 @@ class RapidataConfig(BaseModel):
enableBetaFeatures (bool): Whether to enable beta features. Defaults to False.
upload (UploadConfig): The configuration for the upload process.
Such as the maximum number of worker threads for processing media paths and the maximum number of retries for failed uploads.
order (OrderConfig): The configuration for the order process.
Such as the minimum number of datapoints required so that an automatic validationset gets created if no recommended was found.
logging (LoggingConfig): The configuration for the logging process.
Such as the logging level and the logging file.

Expand All @@ -29,7 +26,6 @@ class RapidataConfig(BaseModel):

enableBetaFeatures: bool = False
upload: UploadConfig = Field(default_factory=UploadConfig)
order: OrderConfig = Field(default_factory=OrderConfig)
logging: LoggingConfig = Field(default_factory=LoggingConfig)


Expand Down
37 changes: 31 additions & 6 deletions src/rapidata/rapidata_client/config/upload_config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path
import shutil
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator
from rapidata.rapidata_client.config import logger


Expand All @@ -11,19 +11,29 @@ class UploadConfig(BaseModel):
Attributes:
maxWorkers (int): The maximum number of worker threads for concurrent uploads. Defaults to 25.
maxRetries (int): The maximum number of retries for failed uploads. Defaults to 3.
cacheUploads (bool): Enable/disable upload caching. Defaults to True.
cacheToDisk (bool): Enable disk-based caching for file uploads. If False, uses in-memory cache only. Defaults to True.
Note: URL assets are always cached in-memory regardless of this setting.
Caching cannot be disabled entirely as it's required for the two-step upload flow.
cacheTimeout (float): Cache operation timeout in seconds. Defaults to 1.
cacheLocation (Path): Directory for cache storage. Defaults to ~/.cache/rapidata/upload_cache.
This is immutable
This is immutable. Only used for file uploads when cacheToDisk=True.
cacheShards (int): Number of cache shards for parallel access. Defaults to 128.
Higher values improve concurrency but increase file handles. Must be positive.
This is immutable
This is immutable. Only used for file uploads when cacheToDisk=True.
enableBatchUpload (bool): Enable batch URL uploading (two-step process). Defaults to True.
batchSize (int): Number of URLs per batch (100-5000). Defaults to 1000.
batchPollInterval (float): Polling interval in seconds. Defaults to 0.5.
"""

model_config = ConfigDict(validate_assignment=True)

maxWorkers: int = Field(default=25)
maxRetries: int = Field(default=3)
cacheUploads: bool = Field(default=True)
cacheTimeout: float = Field(default=0.1)
cacheToDisk: bool = Field(
default=True,
description="Enable disk-based caching for file uploads. URLs are always cached in-memory.",
)
cacheTimeout: float = Field(default=1)
cacheLocation: Path = Field(
default=Path.home() / ".cache" / "rapidata" / "upload_cache",
frozen=True,
Expand All @@ -32,6 +42,14 @@ class UploadConfig(BaseModel):
default=128,
frozen=True,
)
batchSize: int = Field(
default=1000,
description="Number of URLs per batch (100-5000)",
)
batchPollInterval: float = Field(
default=0.5,
description="Polling interval in seconds",
)

@field_validator("maxWorkers")
@classmethod
Expand All @@ -54,6 +72,13 @@ def validate_cache_shards(cls, v: int) -> int:
)
return v

@field_validator("batchSize")
@classmethod
def validate_batch_size(cls, v: int) -> int:
    """Ensure the configured batch size meets the minimum of 100 URLs."""
    minimum = 100
    if v < minimum:
        raise ValueError("batchSize must be at least 100")
    return v

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._migrate_cache()
Expand Down
Loading