diff --git a/Campaign/management/commands/ExportSystemScoresToCSV.py b/Campaign/management/commands/ExportSystemScoresToCSV.py index 6116d238..87b790cd 100644 --- a/Campaign/management/commands/ExportSystemScoresToCSV.py +++ b/Campaign/management/commands/ExportSystemScoresToCSV.py @@ -1,9 +1,22 @@ # pylint: disable=C0103,C0111,C0330,E1101 +""" +Django management command to export system scores to CSV format. + +This command supports exporting scores from either: +1. Active user accounts (default behavior) +2. Reset accounts only (shadow users created during task agenda resets) + +The --reset-accounts-only option allows you to export scores from previous +annotation rounds before task agendas were reset, enabling comparison +between different annotation phases. +""" import csv import sys +from re import compile as re_compile from django.core.management.base import BaseCommand from django.core.management.base import CommandError +from django.contrib.auth.models import User from Campaign.models import Campaign from EvalData.models import TASK_DEFINITIONS @@ -11,8 +24,114 @@ CAMPAIGN_TASK_PAIRS = {(tup[1], tup[2]) for tup in TASK_DEFINITIONS} +def get_shadow_users(): + """ + Returns a list of shadow users (inactive users created during task agenda resets). + Shadow users follow the pattern: {original_username}-{hex_number} + """ + shadow_pattern = re_compile(r'^[^-]+-[0-9a-f]{2}$') + all_users = User.objects.filter(is_active=False) + shadow_users = [user for user in all_users if shadow_pattern.match(user.username)] + return shadow_users + + +def validate_reset_accounts(command_instance): + """ + Validates that shadow users exist for reset accounts export. + Returns shadow users list or None if validation fails. + """ + shadow_users = get_shadow_users() + if not shadow_users: + command_instance.stderr.write( + '# No reset accounts (shadow users) found. ' + 'This means no task agendas have been reset yet.' + ) + return None + + shadow_usernames = [user.username for user in shadow_users] + command_instance.stderr.write( + f'# Found {len(shadow_users)} reset accounts (shadow users): ' + f'{", ".join(shadow_usernames)}. ' + f'Exporting scores from these accounts only.' + ) + return shadow_users + + +def get_scores_from_reset_accounts(result_cls, campaign, options, command_instance): + """ + Retrieves scores from shadow users (reset accounts) for the given result class. + + Args: + result_cls: The result model class to query + campaign: Campaign instance + options: Command options dictionary + command_instance: Command instance for logging + + Returns: + List of filtered scores from shadow users + """ + shadow_users = get_shadow_users() + if not shadow_users: + return [] + + shadow_usernames = {user.username for user in shadow_users} + + # Use the include_retired parameter to get retired annotations + all_scores = result_cls.get_system_data( + campaign.id, + extended_csv=True, + add_batch_info=options['batch_info'], + include_inactive=True, # Include inactive users (shadow users) + include_retired=True, # Include retired annotations + ) + + command_instance.stderr.write( + f"# Found {len(all_scores)} retired annotations from {result_cls.__name__}" + ) + + # Filter to only include scores from shadow users + filtered_scores = [] + for score in all_scores: + # Assuming the username is the first element in the score tuple + if len(score) > 0 and score[0] in shadow_usernames: + filtered_scores.append(score) + + return filtered_scores + + +def get_scores_from_active_accounts(result_cls, campaign, options): + """ + Retrieves scores from active user accounts (normal behavior). + + Args: + result_cls: The result model class to query + campaign: Campaign instance + options: Command options dictionary + + Returns: + List of scores from active users + """ + return result_cls.get_system_data( + campaign.id, + extended_csv=True, + add_batch_info=options['batch_info'], + ) + + class Command(BaseCommand): - help = 'Exports system scores over all results to CSV format' + help = 'Exports system scores over all results to CSV format. Use --reset-accounts-only to export scores from previous annotation rounds (before task agenda resets).' + + """ + Usage examples: + + # Export scores from active users (default behavior): + python manage.py ExportSystemScoresToCSV my_campaign --completed-only --batch-info + + # Export scores from reset accounts only (previous annotation rounds): + python manage.py ExportSystemScoresToCSV my_campaign --reset-accounts-only --completed-only + + # Compare results by running both commands and analyzing the differences + """ def add_arguments(self, parser): parser.add_argument( @@ -30,18 +149,30 @@ def add_arguments(self, parser): action='store_true', help='Export batch and item IDs to help matching the scores to items in the JSON batches', ) + parser.add_argument( + '--reset-accounts-only', + action='store_true', + help='Export scores only from reset accounts (shadow users created during task agenda resets)', + ) # TODO: add argument to specify batch user def handle(self, *args, **options): # Identify Campaign instance for given name. try: campaign = Campaign.get_campaign_or_raise(options['campaign_name']) - except LookupError as error: raise CommandError(error) + # Validate reset accounts if requested + if options['reset_accounts_only']: + shadow_users = validate_reset_accounts(self) + if shadow_users is None: + return + csv_writer = csv.writer(sys.stdout, quoting=csv.QUOTE_MINIMAL) system_scores = [] + total_scores_exported = 0 + for task_cls, result_cls in CAMPAIGN_TASK_PAIRS: qs_name = task_cls.__name__.lower() qs_attr = f'evaldata_{qs_name}_campaign' @@ -52,12 +183,20 @@ def handle(self, *args, **options): qs_obj = qs_obj.filter(completed=True) if qs_obj and qs_obj.exists(): - _scores = result_cls.get_system_data( - campaign.id, - extended_csv=True, - add_batch_info=options['batch_info'], - ) + if options['reset_accounts_only']: + _scores = get_scores_from_reset_accounts(result_cls, campaign, options, self) + else: + _scores = get_scores_from_active_accounts(result_cls, campaign, options) + + total_scores_exported += len(_scores) system_scores.extend(_scores) + # Write all scores to CSV for system_score in system_scores: csv_writer.writerow([str(x) for x in system_score]) + + # Print summary to stderr so it doesn't interfere with CSV output + account_type = "reset accounts" if options['reset_accounts_only'] else "active accounts" + self.stderr.write( + f"# Exported {total_scores_exported} scores from {account_type} for campaign '{campaign.campaignName}'" + ) diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index c10df8eb..0b500c55 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -846,6 +846,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -853,7 +854,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # If campaign ID is given, only return results for this campaign. if campaign_id: diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index c52e4b34..72f109ad 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -710,6 +710,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -717,7 +718,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # If campaign ID is given, only return results for this campaign. if campaign_id: diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index aa76c6d2..00ec5699 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -793,6 +793,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -800,7 +801,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # If campaign ID is given, only return results for this campaign. if campaign_id: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 58843051..0cf8d075 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -923,6 +923,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -930,7 +931,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # If campaign ID is given, only return results for this campaign. if campaign_id: diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index ef7e1cf2..2f514618 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -787,6 +787,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -794,7 +795,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # print('Found completed items: {0}'.format(len(qs))) # If campaign ID is given, only return results for this campaign. diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index b538de4a..9c3a7c25 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -939,6 +939,7 @@ def get_system_data( expand_multi_sys=True, include_inactive=False, add_batch_info=False, + include_retired=False, ): system_data = [] @@ -946,7 +947,11 @@ def get_system_data( if extended_csv: item_types += ('BAD', 'REF') - qs = cls.objects.filter(completed=True, item__itemType__in=item_types) + # Filter by completion status - either completed or retired + if include_retired: + qs = cls.objects.filter(retired=True, item__itemType__in=item_types) + else: + qs = cls.objects.filter(completed=True, item__itemType__in=item_types) # If campaign ID is given, only return results for this campaign. if campaign_id: