api/core/app/apps/pipeline/pipeline_generator.py (6 changes: 3 additions & 3 deletions)
@@ -34,7 +34,7 @@
from core.datasource.online_drive.online_drive_plugin import OnlineDriveDatasourcePlugin
from core.entities.knowledge_entities import PipelineDataset, PipelineDocument
from core.model_runtime.errors.invoke import InvokeAuthorizationError
-from core.rag.index_processor.constant.built_in_field import BuiltInField
+from core.rag.index_processor.constant.built_in_field import BuiltInField, get_safe_data_source_value
from core.repositories.factory import DifyCoreRepositoryFactory
from core.workflow.repositories.draft_variable_repository import DraftVariableSaverFactory
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
@@ -726,10 +726,10 @@ def _build_document(
if built_in_field_enabled:
doc_metadata = {
BuiltInField.document_name: name,
-BuiltInField.uploader: account.name,
+BuiltInField.uploader: account.name or "Unknown",
Copilot AI commented (Oct 26, 2025):

[nitpick] Using a hardcoded string 'Unknown' for missing uploader data may cause issues with data consistency and internationalization. Consider defining this as a constant (e.g., UNKNOWN_UPLOADER = 'Unknown') at the module level or in a constants file to ensure consistency across the codebase and easier maintenance.
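A minimal sketch of that suggestion (the constant name and placement are the reviewer's example, not code in this PR):

# e.g. at module level in api/core/rag/index_processor/constant/built_in_field.py
UNKNOWN_UPLOADER = "Unknown"

# the call site above would then become:
BuiltInField.uploader: account.name or UNKNOWN_UPLOADER,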

BuiltInField.upload_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
-BuiltInField.source: datasource_type,
+BuiltInField.source: get_safe_data_source_value(datasource_type),
}
if doc_metadata:
document.doc_metadata = doc_metadata
api/core/rag/index_processor/constant/built_in_field.py (22 changes: 22 additions & 0 deletions)
@@ -1,5 +1,8 @@
import logging
from enum import StrEnum, auto

logger = logging.getLogger(__name__)


class BuiltInField(StrEnum):
document_name = auto()
@@ -15,3 +18,22 @@ class MetadataDataSource(StrEnum):
notion_import = "notion"
local_file = "file_upload"
online_document = "online_document"
online_drive = "online_drive"


def get_safe_data_source_value(data_source_type: str) -> str:
"""
Safely get data source value for metadata.
Returns the mapped value if the name exists in the enum; otherwise returns the original value.

Args:
data_source_type: The data source type string

Returns:
The mapped enum value if it exists, otherwise the original value
"""
try:
return MetadataDataSource[data_source_type].value
Copilot AI commented (Oct 26, 2025):

Bracket notation on an Enum class raises KeyError for an unknown member name, and the try/except here catches that and falls back to the original value, exactly as the docstring describes. MetadataDataSource[data_source_type] returns an enum member, so reading .value on it is correct. The exception handling is appropriate.

except KeyError:
logger.warning("Unknown data source type: %s, using original value", data_source_type)
return data_source_type
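
A quick usage sketch of the new helper, using values from the enum above:

# known data source types map through MetadataDataSource
assert get_safe_data_source_value("notion_import") == "notion"
assert get_safe_data_source_value("local_file") == "file_upload"
# unknown types are returned unchanged (and a warning is logged)
assert get_safe_data_source_value("some_future_source") == "some_future_source"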
api/services/dataset_service.py (6 changes: 3 additions & 3 deletions)
@@ -19,7 +19,7 @@
from core.helper.name_generator import generate_incremental_name
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
-from core.rag.index_processor.constant.built_in_field import BuiltInField
+from core.rag.index_processor.constant.built_in_field import BuiltInField, get_safe_data_source_value
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from events.dataset_event import dataset_was_deleted
@@ -2010,10 +2010,10 @@ def build_document(
if dataset.built_in_field_enabled:
doc_metadata = {
BuiltInField.document_name: name,
-BuiltInField.uploader: account.name,
+BuiltInField.uploader: account.name or "Unknown",
Copilot AI commented (Oct 26, 2025):

[nitpick] Same comment as in pipeline_generator.py: replace the hardcoded 'Unknown' with a shared UNKNOWN_UPLOADER constant.

BuiltInField.upload_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
-BuiltInField.source: data_source_type,
+BuiltInField.source: get_safe_data_source_value(data_source_type),
}
if doc_metadata:
document.doc_metadata = doc_metadata
api/services/metadata_service.py (41 changes: 29 additions & 12 deletions)
@@ -1,7 +1,10 @@
import copy
import logging

-from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
+from core.rag.index_processor.constant.built_in_field import (
+    BuiltInField,
+    get_safe_data_source_value,
+)
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs.datetime_utils import naive_utc_now
@@ -45,7 +48,7 @@ def create_metadata(dataset_id: str, metadata_args: MetadataArgs) -> DatasetMeta
return metadata

@staticmethod
-def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> DatasetMetadata:  # type: ignore
+def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> DatasetMetadata | None:
Contributor commented (high):

The return types of update_metadata_name and delete_metadata have been changed to return ... | None instead of raising a ValueError when an item is not found. This is a breaking change to their contracts. It's crucial that all callers are updated to handle the None return value to prevent TypeError or other unexpected behavior. This pull request does not seem to include updates to the call sites, which is risky.
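
For illustration, a caller would now need to handle the None case itself, along these lines (a sketch; the surrounding handler is assumed):

metadata = MetadataService.update_metadata_name(dataset_id, metadata_id, name)
if metadata is None:
    # the service previously raised this itself; callers must now decide
    raise ValueError("Metadata not found.")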

# check if metadata name is too long
if len(name) > 255:
raise ValueError("Metadata name cannot exceed 255 characters.")
@@ -66,7 +69,7 @@ def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> Datase
MetadataService.knowledge_base_metadata_lock_check(dataset_id, None)
metadata = db.session.query(DatasetMetadata).filter_by(id=metadata_id).first()
if metadata is None:
raise ValueError("Metadata not found.")
return None
old_name = metadata.name
metadata.name = name
metadata.updated_by = current_user.id
@@ -91,18 +94,20 @@ def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> Datase
db.session.commit()
return metadata
except Exception:
db.session.rollback()
logger.exception("Update metadata name failed")
raise
finally:
redis_client.delete(lock_key)
Comment on lines 96 to 101
Contributor commented (medium):

The pattern of try...except...finally for database transactions and Redis locks is repeated in update_metadata_name, delete_metadata, enable_built_in_field, disable_built_in_field, and update_documents_metadata. This code duplication can make future maintenance difficult.

To adhere to the DRY (Don't Repeat Yourself) principle, you could abstract this pattern into a reusable context manager. This would make the intent of the code clearer and reduce the chance of errors.

Here's an example of how it could look:

from contextlib import contextmanager

@contextmanager
def managed_transaction(lock_key: str, error_message: str):
    try:
        yield
        db.session.commit()
    except Exception:
        db.session.rollback()
        logger.exception(error_message)
        raise
    finally:
        if redis_client.get(lock_key):
            redis_client.delete(lock_key)

Using this, the update_metadata_name method would be simplified, improving readability.
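
For instance, update_metadata_name could then shrink to roughly the following (a sketch that assumes the managed_transaction helper above; the elided part is the existing rename propagation into bound documents):

@staticmethod
def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> DatasetMetadata | None:
    if len(name) > 255:
        raise ValueError("Metadata name cannot exceed 255 characters.")
    lock_key = f"dataset_metadata_lock_{dataset_id}"
    with managed_transaction(lock_key, "Update metadata name failed"):
        MetadataService.knowledge_base_metadata_lock_check(dataset_id, None)
        metadata = db.session.query(DatasetMetadata).filter_by(id=metadata_id).first()
        if metadata is None:
            return None
        metadata.name = name
        metadata.updated_by = current_user.id
        metadata.updated_at = naive_utc_now()
        # ... update doc_metadata on documents bound to this metadata, as before ...
    return metadata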


@staticmethod
-def delete_metadata(dataset_id: str, metadata_id: str):
+def delete_metadata(dataset_id: str, metadata_id: str) -> DatasetMetadata | None:
lock_key = f"dataset_metadata_lock_{dataset_id}"
try:
MetadataService.knowledge_base_metadata_lock_check(dataset_id, None)
metadata = db.session.query(DatasetMetadata).filter_by(id=metadata_id).first()
if metadata is None:
raise ValueError("Metadata not found.")
return None
db.session.delete(metadata)

# deal related documents
@@ -123,7 +128,9 @@ def delete_metadata(dataset_id: str, metadata_id: str):
db.session.commit()
return metadata
except Exception:
db.session.rollback()
logger.exception("Delete metadata failed")
raise
finally:
redis_client.delete(lock_key)

@@ -153,16 +160,18 @@ def enable_built_in_field(dataset: Dataset):
else:
doc_metadata = copy.deepcopy(document.doc_metadata)
doc_metadata[BuiltInField.document_name] = document.name
-doc_metadata[BuiltInField.uploader] = document.uploader
+doc_metadata[BuiltInField.uploader] = document.uploader or "Unknown"
Copilot AI commented (Oct 26, 2025):

[nitpick] Same comment as in pipeline_generator.py: replace the hardcoded 'Unknown' with a shared UNKNOWN_UPLOADER constant.

doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp()
doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp()
-doc_metadata[BuiltInField.source] = MetadataDataSource[document.data_source_type]
+doc_metadata[BuiltInField.source] = get_safe_data_source_value(document.data_source_type)
document.doc_metadata = doc_metadata
db.session.add(document)
dataset.built_in_field_enabled = True
db.session.commit()
except Exception:
db.session.rollback()
logger.exception("Enable built-in field failed")
raise
finally:
redis_client.delete(lock_key)

@@ -193,7 +202,9 @@ def disable_built_in_field(dataset: Dataset):
dataset.built_in_field_enabled = False
db.session.commit()
except Exception:
db.session.rollback()
logger.exception("Disable built-in field failed")
raise
finally:
redis_client.delete(lock_key)

@@ -203,21 +214,25 @@ def update_documents_metadata(dataset: Dataset, metadata_args: MetadataOperation
lock_key = f"document_metadata_lock_{operation.document_id}"
try:
MetadataService.knowledge_base_metadata_lock_check(None, operation.document_id)
-document = DocumentService.get_document(dataset.id, operation.document_id)
+try:
+    document = DocumentService.get_document(dataset.id, operation.document_id)
+except Exception as e:
+    logger.warning("Failed to get document %s: %s", operation.document_id, str(e))
+    continue
Comment on lines +219 to +221
Contributor commented (medium):

Catching a broad Exception and then continuing the loop can mask serious underlying problems. For instance, if the database connection is lost, this will log a warning for every document in the batch and continue, instead of failing fast. It's better to catch more specific, expected exceptions for a single-item failure and let unexpected exceptions (like database connection errors) propagate to the outer except block. This would correctly halt the entire batch operation on a fatal error, which is the safer default behavior.
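
A sketch of the narrower variant (the caught type is a placeholder; use whatever DocumentService actually raises for a per-document failure):

try:
    document = DocumentService.get_document(dataset.id, operation.document_id)
except ValueError as e:  # placeholder for the expected per-item error
    logger.warning("Failed to get document %s: %s", operation.document_id, str(e))
    continue
# anything else (e.g. a lost DB connection) propagates to the outer handler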

if document is None:
raise ValueError("Document not found.")
logger.warning("Document not found: %s", operation.document_id)
continue
doc_metadata = {}
for metadata_value in operation.metadata_list:
doc_metadata[metadata_value.name] = metadata_value.value
if dataset.built_in_field_enabled:
doc_metadata[BuiltInField.document_name] = document.name
-doc_metadata[BuiltInField.uploader] = document.uploader
+doc_metadata[BuiltInField.uploader] = document.uploader or "Unknown"
doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp()
doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp()
-doc_metadata[BuiltInField.source] = MetadataDataSource[document.data_source_type]
+doc_metadata[BuiltInField.source] = get_safe_data_source_value(document.data_source_type)
document.doc_metadata = doc_metadata
db.session.add(document)
db.session.commit()
# deal metadata binding
db.session.query(DatasetMetadataBinding).filter_by(document_id=operation.document_id).delete()
current_user, current_tenant_id = current_account_with_tenant()
Expand All @@ -232,7 +247,9 @@ def update_documents_metadata(dataset: Dataset, metadata_args: MetadataOperation
db.session.add(dataset_metadata_binding)
db.session.commit()
except Exception:
db.session.rollback()
logger.exception("Update documents metadata failed")
raise
Comment on lines 234 to +252
Contributor commented (medium):

The update_documents_metadata function processes a batch of metadata updates. Inside the loop, if an exception occurs during the update of a single document's metadata, the transaction for that document is rolled back, and the exception is re-raised, which terminates the entire batch operation. This is a "fail-fast" strategy.

For a more robust batch operation, you might consider a strategy where the failure of one document does not stop the processing of others. You could collect the errors for each failed document and report them all at the end, allowing successful updates to proceed.
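
A sketch of that collect-and-report strategy (loop and variable names are illustrative):

failures: list[tuple[str, str]] = []
for operation in metadata_args.operation_data:  # attribute name assumed from the surrounding code
    lock_key = f"document_metadata_lock_{operation.document_id}"
    try:
        # ... per-document metadata update as above ...
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        failures.append((operation.document_id, str(e)))
    finally:
        redis_client.delete(lock_key)
if failures:
    raise ValueError(f"Metadata update failed for {len(failures)} document(s): {failures}")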

finally:
redis_client.delete(lock_key)
