Quickstart
This example creates a tiny text dataset, discovers a shared schema, and then generates per-class feature values.
from pathlib import Path
from llm_feature_gen import (
discover_features_from_texts,
generate_features_from_texts,
)
# Texts used only for the discovery step.
discover_samples = {
    "demo_discover_texts/sample1.txt": "The dish was rich, spicy, and served in a deep bowl.",
    "demo_discover_texts/sample2.txt": "The dessert was light, creamy, and topped with fresh fruit.",
}
# Texts used for generation, one sub-folder per class under demo_texts/.
generation_samples = {
    "demo_texts/positive/review1.txt": "The meal was vibrant, aromatic, and beautifully plated.",
    "demo_texts/negative/review1.txt": "The service was slow and the food arrived cold.",
}

# Write every sample to disk, creating parent folders on demand so the
# example can be run from an empty working directory.
for relative_path, text in {**discover_samples, **generation_samples}.items():
    path = Path(relative_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")

# Step 1: discover the shared schema from the discovery folder.
discovered = discover_features_from_texts("demo_discover_texts")

# Step 2: generate per-class feature values from the class sub-folders.
csv_paths = generate_features_from_texts(
    root_folder="demo_texts",
    merge_to_single_csv=True,
)

print(discovered)
print(csv_paths)
Expected outputs:
- outputs/discovered_text_features.json
- outputs/positive_feature_values.csv
- outputs/negative_feature_values.csv
- outputs/all_feature_values.csv
Common patterns
Use a provider explicitly when you want to switch backends:
from llm_feature_gen import discover_features_from_images
from llm_feature_gen.providers import LocalProvider
# Build the backend once, then hand it to the discovery call explicitly.
provider = LocalProvider()

result = discover_features_from_images(
    provider=provider,
    image_paths_or_folder="discover_images",
)
For tabular datasets, pass the text-bearing column during both discovery and generation:
from llm_feature_gen import (
discover_features_from_tabular,
generate_features_from_tabular,
)
# Discovery: name the column that holds the text.
discover_features_from_tabular(
    "discover_tabular/test.csv",
    text_column="review",
)

# Generation: pass the same text column, plus the label column.
generate_features_from_tabular(
    label_column="label",
    text_column="review",
    root_folder="tabular",
)