Spaces:
Running
Running
New design
Browse files- app.py +248 -330
- model_data/model_a_data.json +220 -0
- model_data/model_b_data.json +471 -0
- model_data/model_c_data.json +417 -0
- scorecard_templates/bias_stereotypes_representation.json +84 -0
- scorecard_templates/cultural_values_sensitive_content.json +71 -0
- scorecard_templates/data_content_labor.json +92 -0
- scorecard_templates/disparate_performance.json +78 -0
- scorecard_templates/environmental_costs.json +65 -0
- scorecard_templates/financial_costs.json +81 -0
- scorecard_templates/privacy_data_protection.json +91 -0
app.py
CHANGED
|
@@ -2,315 +2,173 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.express as px
|
| 4 |
from dataclasses import dataclass, field
|
| 5 |
-
from typing import List, Dict, Tuple
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
@dataclass
|
| 8 |
class ScorecardCategory:
|
| 9 |
name: str
|
| 10 |
-
questions: List[
|
| 11 |
-
category_explainer: str
|
| 12 |
scores: Dict[str, int] = field(default_factory=dict)
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
(
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
("Multi-language and cultural evaluation", "Look for evaluations that test the model's performance and biases across different languages and cultures, not just in the dominant language/culture of the model's origin."),
|
| 26 |
-
("Text-to-image language impact", "This applies to multimodal models. Look for tests using prompts in various languages and writing systems to generate images."),
|
| 27 |
-
("Cultural context shifts", "Some categories (e.g., race, ethnicity) may be defined differently in different cultures. Look for evaluations that acknowledge and account for these differences."),
|
| 28 |
-
("Evaluator diversity", "Look for information about the demographic makeup of the evaluation team and any measures taken to mitigate evaluator bias."),
|
| 29 |
-
("Harmful association detection", "This could include tests for stereotypical word associations in text models or stereotypical visual representations in image models."),
|
| 30 |
-
("Sentiment and toxicity analysis", "Look for evaluations that measure the model's tendency to produce negative sentiment or toxic content when discussing certain groups."),
|
| 31 |
-
("False positive mitigation", "False positives occur when non-stereotypical content is flagged as stereotypical. Look for evaluations that consider this possibility and attempt to mitigate it."),
|
| 32 |
-
("Image generation bias consistency", "This applies to image generation models. Look for evaluations that analyze patterns across multiple generated images to identify consistent biases."),
|
| 33 |
-
("Contextual bias acknowledgment", "Look for discussions about how bias can change over time or in different contexts, and how this impacts the evaluation."),
|
| 34 |
-
("Evaluation limitations disclosure", "Look for transparent discussions about what the evaluation methods can and cannot detect or measure."),
|
| 35 |
-
("Evaluation tool bias transparency", "If the evaluation uses other AI tools (e.g., for sentiment analysis), look for acknowledgment of potential biases in these tools."),
|
| 36 |
-
("Bias amplification discussion", "Look for analyses of how model size, training techniques, or other technical decisions might amplify existing biases in the data or model.")
|
| 37 |
-
],
|
| 38 |
-
"This category assesses the model's handling of bias, stereotypes, and representational harms across various dimensions and contexts."
|
| 39 |
-
),
|
| 40 |
-
ScorecardCategory(
|
| 41 |
-
"Cultural Values and Sensitive Content",
|
| 42 |
-
[
|
| 43 |
-
("Cross-cultural evaluation", "Look for evaluations that test the model's outputs in various cultural settings, not just in the dominant culture of the model's origin."),
|
| 44 |
-
("Intra-country cultural diversity", "Look for evaluations that acknowledge and assess different cultural values that can exist within a single country, rather than treating each country as culturally homogeneous."),
|
| 45 |
-
("Language-specific cultural stereotypes", "Look for tests that assess how cultural stereotypes might manifest differently across languages used by the model."),
|
| 46 |
-
("Participatory cultural evaluation", "Look for evaluations that engage people from various cultures in the assessment process, rather than relying solely on predefined frameworks."),
|
| 47 |
-
("Culture-specific sensitive topics", "Look for evaluations that recognize that sensitive topics can vary by culture and assess the model's performance accordingly."),
|
| 48 |
-
("Hate speech detection across cultures", "Look for evaluations that test hate speech detection across different languages and cultural norms."),
|
| 49 |
-
("Indirect harmful content", "Look for evaluations that examine less overt forms of harmful content, such as microaggressions or coded language."),
|
| 50 |
-
("Intersectional harm assessment", "Look for evaluations that examine how different aspects of identity (e.g., race, gender, religion) might interact to produce unique forms of harmful content."),
|
| 51 |
-
("Cultural value frameworks", "Look for evaluations that leverage recognized frameworks for understanding cultural differences."),
|
| 52 |
-
("Evolving cultural norms", "Look for evaluations that acknowledge the dynamic nature of cultural values and assess the model's adaptability."),
|
| 53 |
-
("Cultural context in multimodal outputs", "Look for evaluations that examine how cultural context is maintained (or lost) when translating between text, image, audio, or video."),
|
| 54 |
-
("Humor and cultural sensitivity", "Look for evaluations that assess whether the model can generate or interpret culturally appropriate humor without causing offense."),
|
| 55 |
-
("Cultural bias in data", "Look for assessments of how the cultural makeup of the training data might influence the model's outputs."),
|
| 56 |
-
("Fairness across cultures", "Look for evaluations that examine whether the model performs equally well for different cultural groups."),
|
| 57 |
-
("Geopolitical neutrality", "Look for evaluations that examine whether the model shows bias towards particular geopolitical viewpoints."),
|
| 58 |
-
("Cultural appropriation", "Look for assessments of whether the model inappropriately uses or misrepresents cultural elements."),
|
| 59 |
-
("Cultural limitation disclosure", "Look for transparent discussions about which cultures the model is well-equipped to handle and where it might fall short."),
|
| 60 |
-
("Evaluation tool cultural bias", "Look for acknowledgment of how the tools used for evaluation (e.g., toxicity detection APIs) might have their own cultural biases."),
|
| 61 |
-
("Psychological impact consideration", "Look for discussions about measures taken to protect the well-being of human evaluators involved in assessing potentially distressing content."),
|
| 62 |
-
("Ongoing cultural evaluation commitment", "Look for plans or processes for continual assessment of cultural impacts as the model is updated or deployed in new contexts.")
|
| 63 |
-
],
|
| 64 |
-
"This category evaluates the model's sensitivity to diverse cultural values and its handling of culturally sensitive content."
|
| 65 |
-
),
|
| 66 |
-
ScorecardCategory(
|
| 67 |
-
"Disparate Performance",
|
| 68 |
-
[
|
| 69 |
-
("Dataset skew assessment", "Look for analyses of how well different groups are represented in the dataset used to train the model."),
|
| 70 |
-
("Geographic bias in data collection", "Look for examinations of how data availability might differ across different geographic regions."),
|
| 71 |
-
("Digital divide consideration", "Look for assessments of how differences in internet access across populations might impact the model's performance."),
|
| 72 |
-
("Content filter bias", "Look for analyses of how content filtering during data collection might disproportionately affect certain groups."),
|
| 73 |
-
("Cross-lingual performance", "Look for evaluations that test the model on standard benchmarks across different languages."),
|
| 74 |
-
("Dialect and accent evaluation", "For speech or text models, look for evaluations that test performance on various dialects or accents within a language."),
|
| 75 |
-
("Low-resource language performance", "Look for evaluations that test the model's capabilities in languages with limited digital presence or fewer speakers."),
|
| 76 |
-
("Multilingual knowledge retrieval", "Look for evaluations that test the model's capacity to access and utilize information in different languages."),
|
| 77 |
-
("Disaggregated performance metrics", "Look for detailed breakdowns of performance metrics (e.g., accuracy, precision, recall) for various subgroups."),
|
| 78 |
-
("Worst-case subgroup performance", "Look for analyses that highlight and quantify performance for the most disadvantaged subgroups."),
|
| 79 |
-
("Intersectional performance analysis", "Look for evaluations that examine how performance varies across intersections of different subgroup characteristics (e.g., race and gender)."),
|
| 80 |
-
("Subgroup coverage metrics", "Look for metrics that show how comprehensively different subgroups have been identified and included in the evaluation."),
|
| 81 |
-
("Image generation quality across concepts", "Look for assessments of how image quality might vary when generating images related to different cultural or demographic groups."),
|
| 82 |
-
("Hallucination disparity", "Look for evaluations that examine whether the model is more likely to produce false or unsupported information for some groups compared to others."),
|
| 83 |
-
("Cultural accuracy in image recognition", "Look for evaluations that test whether the model accurately identifies or describes cultural elements across different groups."),
|
| 84 |
-
("Realism disparity in generation", "Look for assessments of whether generated content (text, images, etc.) is equally realistic or high-quality across different demographic or cultural categories."),
|
| 85 |
-
("Intervention impact assessment", "Look for analyses of how attempts to address one form of bias or disparity might have unintended consequences for other groups."),
|
| 86 |
-
("Synthetic data impact", "Look for evaluations that examine whether using AI-generated data in training creates or exacerbates performance disparities."),
|
| 87 |
-
("Feature predictiveness analysis", "Look for analyses of whether certain features are more or less predictive for different groups, potentially leading to performance disparities."),
|
| 88 |
-
("Conceptualization of performance", "Look for discussions or analyses that question whether standard performance metrics adequately capture the needs and experiences of all affected groups.")
|
| 89 |
-
],
|
| 90 |
-
"This category examines potential disparities in the model's performance across different groups and contexts."
|
| 91 |
-
),
|
| 92 |
-
ScorecardCategory(
|
| 93 |
-
"Environmental Costs and Carbon Emissions",
|
| 94 |
-
[
|
| 95 |
-
("Training phase energy consumption", "Look for assessments of the total energy used during the model's initial training period."),
|
| 96 |
-
("Inference phase energy consumption", "Look for assessments of the ongoing energy use when the model is actively being used for predictions or generations."),
|
| 97 |
-
("Carbon footprint calculation", "Look for estimations of greenhouse gas emissions associated with the model's training and deployment, potentially using tools like CodeCarbon or Carbontracker."),
|
| 98 |
-
("Energy source consideration", "Look for assessments that take into account the type of energy powering the computing resources."),
|
| 99 |
-
("Hardware efficiency assessment", "Look for analyses of the energy consumption of specific hardware components used for training and inference."),
|
| 100 |
-
("Data center efficiency", "Look for assessments of the overall energy efficiency of the computing facilities, including cooling systems."),
|
| 101 |
-
("Hardware lifecycle assessment", "Look for analyses that include the broader lifecycle costs of the computing infrastructure, not just operational energy use."),
|
| 102 |
-
("Memory usage optimization", "Look for analyses of how efficiently the model uses memory resources and any optimizations made to reduce energy consumption."),
|
| 103 |
-
("Model size and efficiency trade-off", "Look for analyses of how model size (e.g., number of parameters) affects energy consumption and whether more efficient architectures have been considered."),
|
| 104 |
-
("Fine-tuning vs. pre-training efficiency", "Look for assessments of the energy trade-offs between adapting pre-trained models and training new models from scratch."),
|
| 105 |
-
("Task-specific energy consumption", "Look for analyses of how energy use varies depending on the specific tasks the model is performing."),
|
| 106 |
-
("Marginal cost analysis", "Look for assessments of how incremental improvements to the model affect its energy consumption."),
|
| 107 |
-
("Standardized reporting metrics", "Look for the use of widely accepted metrics such as FLOPS, energy consumption in kWh, or carbon emissions in CO2e."),
|
| 108 |
-
("Comprehensive measurement tools", "Look for the use of tools that capture a wide range of factors, such as experiment-impact-tracker or holistic Life Cycle Assessment (LCA) approaches."),
|
| 109 |
-
("Supply chain emissions", "Look for assessments that include indirect emissions from manufacturing, transportation, and other supply chain activities."),
|
| 110 |
-
("Transparency in reporting", "Look for clear explanations of how environmental impact figures were calculated, including any assumptions or limitations."),
|
| 111 |
-
("Energy efficiency improvements", "Look for documentation of strategies implemented to reduce energy consumption in subsequent versions or deployments of the model."),
|
| 112 |
-
("Carbon offsetting initiatives", "Look for information about programs to compensate for the model's carbon emissions through activities like reforestation or renewable energy investments."),
|
| 113 |
-
("Long-term environmental impact", "Look for analyses that project the potential environmental impact if the model or similar models become widely used in the future."),
|
| 114 |
-
("Integration of environmental considerations in model design", "Look for evidence that environmental impact is a key consideration from the early stages of model conceptualization and development.")
|
| 115 |
-
],
|
| 116 |
-
"This category assesses the environmental impact of the model, including energy consumption and carbon emissions throughout its lifecycle."
|
| 117 |
-
),
|
| 118 |
-
ScorecardCategory(
|
| 119 |
-
"Privacy and Data Protection",
|
| 120 |
-
[
|
| 121 |
-
("Active consent mechanisms", "Look for assessments of how the system obtains explicit user consent for collecting, processing, and sharing data."),
|
| 122 |
-
("Opt-in data collection", "Look for analyses of whether users must actively choose to share their data rather than having to opt out of data collection."),
|
| 123 |
-
("Data minimization practices", "Look for evaluations of whether the system collects only the data necessary for its stated purposes."),
|
| 124 |
-
("Retroactive data removal", "Look for assessments of whether the system can honor user requests to delete their data, including retraining if necessary."),
|
| 125 |
-
("Training data transparency", "Look for examinations of whether information about the sources and nature of training data is publicly available."),
|
| 126 |
-
("Copyright and licensed content", "Look for evaluations of whether the system respects intellectual property rights in its training data and outputs."),
|
| 127 |
-
("Personally Identifiable Information (PII) in training data", "Look for analyses of how the system identifies and protects PII within its training dataset."),
|
| 128 |
-
("Data deduplication efforts", "Look for assessments of techniques used to remove duplicate entries in the training data, which can reduce the risk of memorization."),
|
| 129 |
-
("Memorization assessment", "Look for tests that attempt to extract specific training examples or sensitive information from the model's outputs."),
|
| 130 |
-
("Out-of-distribution data revelation", "Look for evaluations of whether the model unexpectedly outputs information that wasn't intended to be part of its training."),
|
| 131 |
-
("PII generation prevention", "Look for tests of whether the model can recognize and refrain from outputting sensitive personal information."),
|
| 132 |
-
("Contextual privacy violations", "Look for evaluations of whether the model respects the appropriate context for revealing certain types of information."),
|
| 133 |
-
("Data encryption practices", "Look for assessments of how user data is encrypted both in transit and at rest."),
|
| 134 |
-
("Access control mechanisms", "Look for evaluations of how the system restricts access to sensitive data and functionalities."),
|
| 135 |
-
("Vulnerability to membership inference attacks", "Look for assessments of whether an attacker can determine if a particular data point was used in the model's training."),
|
| 136 |
-
("System prompt protection", "Look for evaluations of whether the model inadvertently reveals sensitive information contained in its system prompts."),
|
| 137 |
-
("Regulatory compliance", "Look for analyses of how well the system adheres to applicable data protection laws and regulations."),
|
| 138 |
-
("Privacy-preserving machine learning techniques", "Look for assessments of whether techniques like differential privacy or federated learning are implemented to enhance privacy."),
|
| 139 |
-
("Community-centered privacy definitions", "Look for evaluations that take into account different cultural and community perspectives on privacy, especially from marginalized groups."),
|
| 140 |
-
("Long-term privacy implications", "Look for analyses that project how privacy risks might evolve over time as the system is used and potentially combined with other data sources.")
|
| 141 |
-
],
|
| 142 |
-
"This category evaluates the model's adherence to privacy principles and data protection practices."
|
| 143 |
-
),
|
| 144 |
-
ScorecardCategory(
|
| 145 |
-
"Financial Costs",
|
| 146 |
-
[
|
| 147 |
-
("Training data storage costs", "Look for estimates of storage costs for the dataset used to train the model, considering factors like volume and storage type (e.g., in-house vs. cloud)."),
|
| 148 |
-
("Model storage costs", "Look for assessments of storage costs for the final model, which may vary based on model architecture and storage solutions."),
|
| 149 |
-
("Data preprocessing costs", "Look for estimates of costs related to preparing data for training, such as creating spectrograms for audio data or preprocessing images."),
|
| 150 |
-
("Data sourcing costs", "Look for assessments of expenses related to purchasing datasets, crowd-sourcing data collection, or other data acquisition methods."),
|
| 151 |
-
("Training hardware costs", "Look for evaluations of expenses related to GPUs, TPUs, or other specialized hardware used during model training."),
|
| 152 |
-
("Cloud computing costs", "If cloud services were used, look for assessments of expenses based on instance-hours or other cloud pricing models."),
|
| 153 |
-
("Training time costs", "Look for analyses that track compute costs over the duration of the training process, potentially identifying cost-saving opportunities."),
|
| 154 |
-
("Model size and cost relationship", "Look for assessments of how different model sizes (e.g., number of parameters) impact overall training expenses."),
|
| 155 |
-
("Hosting costs", "Look for evaluations of expenses related to making the model available for use, including server costs and potential cloud service fees."),
|
| 156 |
-
("Inference hardware costs", "Look for assessments of expenses related to the computing resources needed to run the model in production."),
|
| 157 |
-
("API usage costs", "For API-accessible models, look for analyses of how API calls are priced, potentially considering factors like token usage or request volume."),
|
| 158 |
-
("Scaling costs", "Look for assessments of how expenses might change as the model's usage grows, including costs for maintaining low latency and high availability."),
|
| 159 |
-
("Research and development labor costs", "Look for estimates of expenses related to the time spent by researchers and developers in creating and refining the model."),
|
| 160 |
-
("Crowd-worker costs", "If applicable, look for assessments of expenses related to hiring crowd workers for tasks like data labeling or model evaluation."),
|
| 161 |
-
("Ongoing maintenance labor costs", "Look for estimates of expenses related to continued model updates, fine-tuning, or other maintenance tasks."),
|
| 162 |
-
("Specialized expertise costs", "Look for evaluations of expenses related to hiring or consulting with domain experts or AI specialists."),
|
| 163 |
-
("Total cost of ownership analysis", "Look for assessments that combine all cost factors to provide a holistic view of the model's financial impact."),
|
| 164 |
-
("Cost optimization strategies", "Look for analyses of potential cost-saving measures, such as more efficient architectures or training procedures."),
|
| 165 |
-
("Long-term cost projections", "Look for assessments that forecast how costs might evolve over time, considering factors like technology improvements or changing demand."),
|
| 166 |
-
("Hidden cost identification", "Look for analyses that consider less obvious cost factors, such as environmental impact or opportunity costs.")
|
| 167 |
-
],
|
| 168 |
-
"This category assesses the financial implications of developing, deploying, and maintaining the model."
|
| 169 |
-
),
|
| 170 |
-
ScorecardCategory(
|
| 171 |
-
"Data and Content Moderation Labor",
|
| 172 |
-
[
|
| 173 |
-
("Adherence to established standards", "Look for assessments of how well the crowdwork practices align with recognized industry standards for fair labor."),
|
| 174 |
-
("Fair compensation", "Look for analyses of whether crowdworkers are paid fairly for their time and effort, considering factors like local living wages."),
|
| 175 |
-
("Working hours and breaks", "Look for evaluations of whether crowdworkers have reasonable working hours and adequate breaks, especially for tasks involving traumatic content."),
|
| 176 |
-
("Psychological support", "Look for assessments of whether immediate and long-term psychological support is provided, especially for workers exposed to traumatic content."),
|
| 177 |
-
("Crowdwork documentation", "Look for examinations of how well the role of crowdwork in dataset development is documented, potentially using frameworks like CrowdWorkSheets."),
|
| 178 |
-
("Demographic information", "Look for assessments of whether and how demographic information about crowdworkers is collected and reported."),
|
| 179 |
-
("Task instructions transparency", "Look for evaluations of whether the instructions provided to crowdworkers are well-documented and accessible for review."),
|
| 180 |
-
("Assessment and compensation transparency", "Look for analyses of how clearly the methods for evaluating and compensating crowdworkers are documented and communicated."),
|
| 181 |
-
("Exposure limits", "Look for examinations of whether there are policies in place to limit the amount of traumatic material workers are exposed to in a given session."),
|
| 182 |
-
("Content warning practices", "Look for assessments of whether crowdworkers are given adequate warnings before being exposed to potentially disturbing content."),
|
| 183 |
-
("Trauma support availability", "Look for evaluations of whether immediate trauma support is available for workers exposed to disturbing content."),
|
| 184 |
-
("Long-term health monitoring", "Look for assessments of whether there are systems in place to monitor and support the long-term mental health of workers regularly exposed to traumatic content."),
|
| 185 |
-
("Labor law compliance", "Look for examinations of how well the crowdwork practices align with local and international labor regulations."),
|
| 186 |
-
("Worker representation", "Look for assessments of whether crowdworkers have avenues to voice concerns or negotiate collectively."),
|
| 187 |
-
("Dispute resolution processes", "Look for evaluations of how conflicts or disagreements between crowdworkers and employers are handled and resolved."),
|
| 188 |
-
("Job security and continuity", "Look for assessments of whether crowdworkers have any guarantees of ongoing work or protections against sudden loss of income."),
|
| 189 |
-
("Ethical review processes", "Look for examinations of whether there are systems in place to review and ensure the ethical treatment of crowdworkers."),
|
| 190 |
-
("Worker feedback incorporation", "Look for assessments of whether there are mechanisms to gather and act upon feedback from crowdworkers."),
|
| 191 |
-
("Automation impact assessment", "Look for evaluations of how advancements in AI might affect the nature and availability of crowdwork in the future."),
|
| 192 |
-
("Continuous improvement initiatives", "Look for assessments of whether there are active initiatives or plans to enhance the working conditions and treatment of crowdworkers over time.")
|
| 193 |
-
],
|
| 194 |
-
"This category evaluates the treatment and conditions of workers involved in data annotation and content moderation for the model."
|
| 195 |
-
)
|
| 196 |
-
]
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
"Version": "2.1",
|
| 217 |
-
"Release Date": "2023-06-15",
|
| 218 |
-
"Type": "Multimodal AI"
|
| 219 |
-
},
|
| 220 |
-
"scores": {
|
| 221 |
-
category.name: {question: 0 for question, _ in category.questions}
|
| 222 |
-
for category in scorecard_template
|
| 223 |
-
}
|
| 224 |
-
},
|
| 225 |
-
"Model C": {
|
| 226 |
-
"metadata": {
|
| 227 |
-
"Name": "Model C",
|
| 228 |
-
"Provider": "Company Z",
|
| 229 |
-
"Version": "3.0",
|
| 230 |
-
"Release Date": "2023-12-01",
|
| 231 |
-
"Type": "Specialized NLP Model"
|
| 232 |
-
},
|
| 233 |
-
"scores": {
|
| 234 |
-
category.name: {question: 1 if i % 2 == 0 else 0 for i, (question, _) in enumerate(category.questions)}
|
| 235 |
-
for category in scorecard_template
|
| 236 |
-
}
|
| 237 |
-
}
|
| 238 |
-
}
|
| 239 |
|
| 240 |
css = """
|
| 241 |
-
.
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
}
|
| 246 |
-
.
|
| 247 |
-
|
| 248 |
-
border
|
|
|
|
| 249 |
padding: 20px;
|
| 250 |
margin-bottom: 20px;
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
}
|
| 253 |
-
.
|
| 254 |
-
font-size:
|
| 255 |
font-weight: bold;
|
| 256 |
-
margin-bottom:
|
| 257 |
color: #333;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
}
|
| 259 |
-
.
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
font-weight: bold;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
margin-top: 15px;
|
| 263 |
-
|
| 264 |
-
|
| 265 |
}
|
| 266 |
-
.
|
| 267 |
-
font-size:
|
| 268 |
-
|
| 269 |
-
color: #
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
}
|
| 272 |
-
.
|
| 273 |
width: 100%;
|
| 274 |
border-collapse: collapse;
|
| 275 |
}
|
| 276 |
-
.
|
| 277 |
-
|
| 278 |
-
padding: 8px;
|
| 279 |
text-align: left;
|
|
|
|
| 280 |
}
|
| 281 |
-
.
|
| 282 |
-
background-color: #
|
| 283 |
font-weight: bold;
|
| 284 |
}
|
| 285 |
-
.
|
| 286 |
-
|
| 287 |
-
margin-bottom: 20px;
|
| 288 |
}
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
font-size: 18px;
|
| 294 |
-
font-weight: bold;
|
| 295 |
-
margin-top: 20px;
|
| 296 |
-
color: #333;
|
| 297 |
}
|
| 298 |
"""
|
| 299 |
|
| 300 |
def create_leaderboard():
|
| 301 |
-
scores = [
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
html
|
| 310 |
-
html += "<
|
|
|
|
|
|
|
| 311 |
for i, (_, row) in enumerate(df.iterrows(), 1):
|
| 312 |
-
html += f"<tr><td>{i}</td><td>{row['Model']}</td><td>{row['
|
| 313 |
-
html += "</table></div
|
| 314 |
|
| 315 |
return html
|
| 316 |
|
|
@@ -321,78 +179,125 @@ def create_category_chart(selected_models, selected_categories):
|
|
| 321 |
data = []
|
| 322 |
for model in selected_models:
|
| 323 |
for category in selected_categories:
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
df = pd.DataFrame(data)
|
| 328 |
if df.empty:
|
| 329 |
return px.bar(title='No data available for the selected models and categories')
|
| 330 |
|
| 331 |
-
fig = px.bar(df, x='Model', y='Score', color='Category',
|
| 332 |
title='AI Model Scores by Category',
|
| 333 |
-
labels={'Score': '
|
| 334 |
category_orders={"Category": selected_categories})
|
| 335 |
return fig
|
| 336 |
|
| 337 |
-
def
|
| 338 |
if model not in models:
|
| 339 |
-
return "Please select a model to view details."
|
| 340 |
|
| 341 |
-
|
| 342 |
-
html += f"<h2 class='scorecard-title'>Detailed Scorecard for {model}</h2>"
|
| 343 |
-
|
| 344 |
-
# Add model metadata
|
| 345 |
-
html += "<div class='scorecard-card scorecard-metadata'>"
|
| 346 |
-
html += "<h3 class='scorecard-subtitle'>Model Metadata</h3>"
|
| 347 |
for key, value in models[model]['metadata'].items():
|
| 348 |
-
|
| 349 |
-
html += "</div>"
|
| 350 |
|
| 351 |
-
|
| 352 |
-
|
|
|
|
| 353 |
|
|
|
|
| 354 |
for category in scorecard_template:
|
| 355 |
-
if category.name in selected_categories:
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
def update_dashboard(tab, selected_models, selected_model, selected_categories):
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
model_chooser_visibility = gr.update(visible=False)
|
| 378 |
model_multi_chooser_visibility = gr.update(visible=False)
|
| 379 |
category_filter_visibility = gr.update(visible=False)
|
| 380 |
|
| 381 |
if tab == "Leaderboard":
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
elif tab == "Category Analysis":
|
| 384 |
-
|
| 385 |
model_multi_chooser_visibility = gr.update(visible=True)
|
| 386 |
category_filter_visibility = gr.update(visible=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
elif tab == "Detailed Scorecard":
|
| 388 |
-
|
| 389 |
-
details_html = gr.update(value=create_detailed_scorecard(selected_model, selected_categories), visible=True)
|
| 390 |
-
else:
|
| 391 |
-
details_html = gr.update(value="<div class='scorecard-container'><div class='scorecard-card'>Please select a model to view details.</div></div>", visible=True)
|
| 392 |
model_chooser_visibility = gr.update(visible=True)
|
| 393 |
category_filter_visibility = gr.update(visible=True)
|
| 394 |
-
|
| 395 |
-
|
|
|
|
|
|
|
| 396 |
|
| 397 |
with gr.Blocks(css=css) as demo:
|
| 398 |
gr.Markdown("# AI Model Social Impact Scorecard Dashboard")
|
|
@@ -413,31 +318,44 @@ with gr.Blocks(css=css) as demo:
|
|
| 413 |
value=[cat.name for cat in scorecard_template],
|
| 414 |
visible=False)
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
details_output = gr.HTML(visible=False)
|
| 419 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
# Initialize the dashboard with the leaderboard
|
| 421 |
leaderboard_output.value = create_leaderboard()
|
| 422 |
|
| 423 |
tab_selection.change(fn=update_dashboard,
|
| 424 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 425 |
-
outputs=[
|
| 426 |
-
model_chooser, model_multi_chooser, category_filter
|
|
|
|
| 427 |
|
| 428 |
model_chooser.change(fn=update_dashboard,
|
| 429 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 430 |
-
outputs=[
|
| 431 |
-
model_chooser, model_multi_chooser, category_filter
|
|
|
|
| 432 |
|
| 433 |
model_multi_chooser.change(fn=update_dashboard,
|
| 434 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 435 |
-
outputs=[
|
| 436 |
-
model_chooser, model_multi_chooser, category_filter
|
|
|
|
| 437 |
|
| 438 |
category_filter.change(fn=update_dashboard,
|
| 439 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 440 |
-
outputs=[
|
| 441 |
-
model_chooser, model_multi_chooser, category_filter
|
|
|
|
| 442 |
|
| 443 |
-
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.express as px
|
| 4 |
from dataclasses import dataclass, field
|
| 5 |
+
from typing import List, Dict, Tuple, Union
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from collections import OrderedDict
|
| 9 |
|
| 10 |
@dataclass
class ScorecardCategory:
    """A single scorecard category built from one template file.

    Instances are created by the template loader, one per JSON template,
    and are later matched against each model's ``scores`` mapping by ``name``.
    """

    # Human-readable category title; also the lookup key into a model's scores.
    name: str
    # Question entries for the category. Each entry is a dict whose values are
    # either a single string or a list of strings.
    questions: List[Dict[str, Union[str, List[str]]]]
    # Integer score per question; every instance gets its own empty dict.
    scores: Dict[str, int] = field(default_factory=dict)
|
| 15 |
|
| 16 |
+
def load_scorecard_templates(directory):
    """Load every ``*.json`` scorecard template found in *directory*.

    Each template must be a JSON object with a ``name`` and a ``questions``
    entry; one ``ScorecardCategory`` is built per file.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting, the order of the categories (and
    therefore of everything derived from this list) could change between
    machines or runs.

    Args:
        directory: Path of the folder that holds the template files.

    Returns:
        A list of ``ScorecardCategory`` objects ordered by filename.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a template lacks ``name`` or ``questions``.
    """
    templates = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            data = json.load(file)
        templates.append(ScorecardCategory(
            name=data['name'],
            questions=data['questions'],
        ))
    return templates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
# Load scorecard templates
|
| 29 |
+
scorecard_template = load_scorecard_templates('scorecard_templates')
|
| 30 |
+
|
| 31 |
+
# Function to read JSON files and populate models dictionary
|
| 32 |
+
def load_models_from_json(directory):
    """Read every ``*.json`` model file in *directory* into one mapping.

    Each file must be a JSON object with a ``metadata`` object that has a
    ``Name`` entry; that name becomes the key and the whole parsed document
    becomes the value.

    Files are read in sorted filename order so that, should two files declare
    the same model name, the one with the later filename wins on every run
    (``os.listdir`` alone gives no ordering guarantee).

    Args:
        directory: Path of the folder that holds the model data files.

    Returns:
        An ``OrderedDict`` of model name -> parsed model data, sorted
        case-insensitively by model name.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a file lacks ``metadata`` or ``metadata['Name']``.
    """
    models = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            model_data = json.load(file)
        models[model_data['metadata']['Name']] = model_data

    # Sort the models alphabetically by name
    return OrderedDict(sorted(models.items(), key=lambda x: x[0].lower()))
|
| 43 |
+
|
| 44 |
+
# Load models from JSON files
|
| 45 |
+
models = load_models_from_json('model_data')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
css = """
|
| 48 |
+
.container {
|
| 49 |
+
display: flex;
|
| 50 |
+
flex-wrap: wrap;
|
| 51 |
+
justify-content: space-between;
|
| 52 |
}
|
| 53 |
+
.card {
|
| 54 |
+
width: calc(50% - 20px);
|
| 55 |
+
border: 1px solid #e0e0e0;
|
| 56 |
+
border-radius: 10px;
|
| 57 |
padding: 20px;
|
| 58 |
margin-bottom: 20px;
|
| 59 |
+
background-color: #ffffff;
|
| 60 |
+
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
| 61 |
+
transition: all 0.3s ease;
|
| 62 |
+
}
|
| 63 |
+
.card:hover {
|
| 64 |
+
box-shadow: 0 6px 8px rgba(0,0,0,0.15);
|
| 65 |
+
transform: translateY(-5px);
|
| 66 |
}
|
| 67 |
+
.card-title {
|
| 68 |
+
font-size: 1.4em;
|
| 69 |
font-weight: bold;
|
| 70 |
+
margin-bottom: 15px;
|
| 71 |
color: #333;
|
| 72 |
+
border-bottom: 2px solid #e0e0e0;
|
| 73 |
+
padding-bottom: 10px;
|
| 74 |
+
}
|
| 75 |
+
.question {
|
| 76 |
+
margin-bottom: 20px;
|
| 77 |
+
padding: 15px;
|
| 78 |
+
border-radius: 5px;
|
| 79 |
}
|
| 80 |
+
.question h3 {
|
| 81 |
+
margin-top: 0;
|
| 82 |
+
color: #2c3e50;
|
| 83 |
+
}
|
| 84 |
+
.question-yes {
|
| 85 |
+
background-color: #e6ffe6;
|
| 86 |
+
}
|
| 87 |
+
.question-no {
|
| 88 |
+
background-color: #ffe6e6;
|
| 89 |
+
}
|
| 90 |
+
.question-na {
|
| 91 |
+
background-color: #fffde6;
|
| 92 |
+
}
|
| 93 |
+
.status {
|
| 94 |
+
font-weight: bold;
|
| 95 |
+
}
|
| 96 |
+
details {
|
| 97 |
+
margin-top: 10px;
|
| 98 |
+
}
|
| 99 |
+
summary {
|
| 100 |
+
cursor: pointer;
|
| 101 |
+
color: #3498db;
|
| 102 |
font-weight: bold;
|
| 103 |
+
}
|
| 104 |
+
summary:hover {
|
| 105 |
+
text-decoration: underline;
|
| 106 |
+
}
|
| 107 |
+
.category-score, .total-score {
|
| 108 |
+
background-color: #f0f8ff;
|
| 109 |
+
border: 1px solid #b0d4ff;
|
| 110 |
+
border-radius: 5px;
|
| 111 |
+
padding: 10px;
|
| 112 |
margin-top: 15px;
|
| 113 |
+
font-weight: bold;
|
| 114 |
+
text-align: center;
|
| 115 |
}
|
| 116 |
+
.total-score {
|
| 117 |
+
font-size: 1.2em;
|
| 118 |
+
background-color: #e6f3ff;
|
| 119 |
+
border-color: #80bdff;
|
| 120 |
+
}
|
| 121 |
+
.leaderboard-card {
|
| 122 |
+
width: 100%;
|
| 123 |
+
max-width: 800px;
|
| 124 |
+
margin: 0 auto;
|
| 125 |
}
|
| 126 |
+
.leaderboard-table {
|
| 127 |
width: 100%;
|
| 128 |
border-collapse: collapse;
|
| 129 |
}
|
| 130 |
+
.leaderboard-table th, .leaderboard-table td {
|
| 131 |
+
padding: 10px;
|
|
|
|
| 132 |
text-align: left;
|
| 133 |
+
border-bottom: 1px solid #e0e0e0;
|
| 134 |
}
|
| 135 |
+
.leaderboard-table th {
|
| 136 |
+
background-color: #f2f2f2;
|
| 137 |
font-weight: bold;
|
| 138 |
}
|
| 139 |
+
.leaderboard-table tr:last-child td {
|
| 140 |
+
border-bottom: none;
|
|
|
|
| 141 |
}
|
| 142 |
+
@media (max-width: 768px) {
|
| 143 |
+
.card {
|
| 144 |
+
width: 100%;
|
| 145 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
}
|
| 147 |
"""
|
| 148 |
|
| 149 |
def create_leaderboard():
    """Render the leaderboard as an HTML table, best-scoring model first.

    A model's score is the percentage of its questions, across all of its
    categories, whose status is ``'Yes'``. Every question counts toward the
    denominator, whatever its status (``'No'``, ``'N/A'``, ...). A model with
    no questions scores 0.

    Reads the module-level ``models`` mapping.

    Returns:
        An HTML string: a card containing a table with one row per model
        (rank, model name, score percentage with two decimals).

    Raises:
        KeyError: If a model lacks ``scores`` or a question lacks ``status``.
    """
    # Local import: the module does not import `html` at the top of the file.
    from html import escape

    scores = []
    for model, data in models.items():
        total_score = 0
        total_questions = 0
        for category_scores in data['scores'].values():
            for details in category_scores.values():
                if details['status'] == 'Yes':
                    total_score += 1
                total_questions += 1
        score_percentage = (total_score / total_questions) * 100 if total_questions > 0 else 0
        scores.append((model, score_percentage))

    # sorted() is stable, so models with equal scores keep the order they
    # have in `models` instead of an unspecified one.
    ranked = sorted(scores, key=lambda entry: entry[1], reverse=True)

    parts = [
        "<div class='card leaderboard-card'>",
        "<div class='card-title'>AI Model Social Impact Leaderboard</div>",
        "<table class='leaderboard-table'>",
        "<tr><th>Rank</th><th>Model</th><th>Score Percentage</th></tr>",
    ]
    for rank, (model, score_percentage) in enumerate(ranked, 1):
        # Model names come from data files; escape them so characters such as
        # '<' or '&' are shown literally rather than parsed as markup.
        parts.append(
            f"<tr><td>{rank}</td><td>{escape(str(model), quote=False)}</td>"
            f"<td>{score_percentage:.2f}%</td></tr>"
        )
    parts.append("</table></div>")

    return "".join(parts)
|
| 174 |
|
|
|
|
| 179 |
data = []
|
| 180 |
for model in selected_models:
|
| 181 |
for category in selected_categories:
|
| 182 |
+
if category in models[model]['scores']:
|
| 183 |
+
total_questions = len(models[model]['scores'][category])
|
| 184 |
+
yes_count = sum(1 for q in models[model]['scores'][category].values() if q['status'] == 'Yes')
|
| 185 |
+
score_percentage = (yes_count / total_questions) * 100 if total_questions > 0 else 0
|
| 186 |
+
data.append({'Model': model, 'Category': category, 'Score Percentage': score_percentage})
|
| 187 |
|
| 188 |
df = pd.DataFrame(data)
|
| 189 |
if df.empty:
|
| 190 |
return px.bar(title='No data available for the selected models and categories')
|
| 191 |
|
| 192 |
+
fig = px.bar(df, x='Model', y='Score Percentage', color='Category',
|
| 193 |
title='AI Model Scores by Category',
|
| 194 |
+
labels={'Score Percentage': 'Score Percentage'},
|
| 195 |
category_orders={"Category": selected_categories})
|
| 196 |
return fig
|
| 197 |
|
| 198 |
+
def _yes_percentage(yes_count, no_count):
    """Return the share of 'Yes' among Yes/No answers as a percentage (0 if none)."""
    answered = yes_count + no_count
    return yes_count / answered * 100 if answered > 0 else 0


def _render_question_html(question, details):
    """Return the HTML fragment for one question entry of a category card."""
    # Local import: the module does not import `html` at the top of the file.
    from html import escape

    status = details['status']
    # The data files store an explicit null for a missing source, so
    # .get('source', 'N/A') would yield None and render as the text "None".
    source = details.get('source') or 'N/A'

    if status == 'Yes':
        bg_class = 'question-yes'
    elif status == 'No':
        bg_class = 'question-no'
    else:
        bg_class = 'question-na'

    parts = [
        f"<div class='question {bg_class}'>",
        f"<h3>{escape(str(question), quote=False)}</h3>\n\n",
        f"<p><span class='status'>{escape(str(status), quote=False)}</span></p>\n\n"
        f"<p><strong>Source:</strong> {escape(str(source), quote=False)}</p>\n\n",
        "<details><summary>View Applicable Evaluations</summary>\n\n",
    ]
    evaluations = details.get('applicable_evaluations')
    if evaluations:
        parts.append("<ul>")
        parts.extend(f"<li>{escape(str(item), quote=False)}</li>" for item in evaluations)
        parts.append("</ul>\n")
    else:
        parts.append("<p>No applicable evaluations.</p>\n")
    parts.append("</details>\n\n")
    parts.append("</div>")
    return "".join(parts)


def update_detailed_scorecard(model, selected_categories):
    """Build the three component updates that make up the detailed scorecard.

    For the chosen model this produces a Markdown metadata section, one HTML
    card per selected category that the model has scores for (in
    ``scorecard_template`` order), and a total-score banner. Category and total
    scores are the percentage of 'Yes' among Yes/No answers; any other status
    is counted as N/A and left out of the percentage.

    Reads the module-level ``models`` and ``scorecard_template``.

    Args:
        model: Name of the model to show; anything not present in ``models``
            (including ``None``) yields the "please select" placeholder.
        selected_categories: Collection of category names to include.

    Returns:
        A list of three ``gr.update`` objects, in order: metadata Markdown,
        category cards HTML, total score. When no valid model is selected the
        first carries a prompt and the other two are hidden.

    Raises:
        KeyError: If a question entry lacks ``status``.
    """
    # Local import: the module does not import `html` at the top of the file.
    from html import escape

    if model not in models:
        return [gr.update(visible=True, value="Please select a model to view details.")] + [gr.update(visible=False)] * 2

    metadata_md = f"## Model Metadata for {model}\n\n"
    for key, value in models[model]['metadata'].items():
        metadata_md += f"**{key}:** {value}\n\n"

    model_scores = models[model]['scores']
    total_yes = 0
    total_no = 0
    total_na = 0

    cards = ["<div class='container'>"]
    for category in scorecard_template:
        if category.name not in selected_categories or category.name not in model_scores:
            continue

        category_yes = 0
        category_no = 0
        category_na = 0
        cards.append(
            f"<div class='card'><div class='card-title'>{escape(str(category.name), quote=False)}</div>"
        )

        for question, details in model_scores[category.name].items():
            status = details['status']
            if status == 'Yes':
                category_yes += 1
            elif status == 'No':
                category_no += 1
            else:
                category_na += 1
            cards.append(_render_question_html(question, details))

        category_score = _yes_percentage(category_yes, category_no)
        cards.append(
            f"<div class='category-score'>Category Score: {category_score:.2f}% "
            f"(Yes: {category_yes}, No: {category_no}, N/A: {category_na})</div>"
        )
        cards.append("</div>")

        total_yes += category_yes
        total_no += category_no
        total_na += category_na

    cards.append("</div>")
    all_cards_content = "".join(cards)

    total_score = _yes_percentage(total_yes, total_no)
    total_score_md = f"<div class='total-score'>Total Score: {total_score:.2f}% (Yes: {total_yes}, No: {total_no}, N/A: {total_na})</div>"

    return [
        gr.update(value=metadata_md, visible=True),
        gr.update(value=all_cards_content, visible=True),
        gr.update(value=total_score_md, visible=True)
    ]
|
| 270 |
|
| 271 |
def update_dashboard(tab, selected_models, selected_model, selected_categories):
    """Compute the component updates for the currently selected tab.

    The returned list always has 11 entries, in this order:

    1-3.  visibility of the leaderboard, category-analysis and
          detailed-scorecard containers;
    4-6.  visibility of the single-model chooser, the multi-model chooser and
          the category filter;
    7-11. content for the leaderboard HTML, the category chart, and the three
          detailed-scorecard components (metadata, category cards, total).

    Content slots that do not belong to the active tab receive a no-op
    ``gr.update()``.

    Args:
        tab: Name of the active tab ("Leaderboard", "Category Analysis" or
            "Detailed Scorecard").
        selected_models: Models picked in the multi-model chooser; a falsy
            value is treated as an empty list.
        selected_model: Model picked in the single-model chooser.
        selected_categories: Category names picked in the category filter.

    Returns:
        A list of 11 ``gr.update`` objects as described above. For an
        unrecognised ``tab`` all 11 are no-ops, leaving the UI unchanged
        (previously the function fell through and returned ``None``).
    """
    # Visibility flags per tab, in output order: leaderboard container,
    # category-analysis container, detailed-scorecard container,
    # single-model chooser, multi-model chooser, category filter.
    visible_by_tab = {
        "Leaderboard": (True, False, False, False, False, False),
        "Category Analysis": (False, True, False, False, True, True),
        "Detailed Scorecard": (False, False, True, True, False, True),
    }
    if tab not in visible_by_tab:
        return [gr.update() for _ in range(11)]

    visibility = [gr.update(visible=flag) for flag in visible_by_tab[tab]]

    if tab == "Leaderboard":
        leaderboard_html = create_leaderboard()
        content = [gr.update(value=leaderboard_html), gr.update(), gr.update(), gr.update(), gr.update()]
    elif tab == "Category Analysis":
        category_chart = create_category_chart(selected_models or [], selected_categories)
        content = [gr.update(), gr.update(value=category_chart), gr.update(), gr.update(), gr.update()]
    else:  # "Detailed Scorecard"
        scorecard_updates = update_detailed_scorecard(selected_model, selected_categories)
        content = [gr.update(), gr.update()] + scorecard_updates

    return visibility + content
|
| 301 |
|
| 302 |
with gr.Blocks(css=css) as demo:
|
| 303 |
gr.Markdown("# AI Model Social Impact Scorecard Dashboard")
|
|
|
|
| 318 |
value=[cat.name for cat in scorecard_template],
|
| 319 |
visible=False)
|
| 320 |
|
| 321 |
+
with gr.Column(visible=True) as leaderboard_tab:
|
| 322 |
+
leaderboard_output = gr.HTML()
|
|
|
|
| 323 |
|
| 324 |
+
with gr.Column(visible=False) as category_analysis_tab:
|
| 325 |
+
category_chart = gr.Plot()
|
| 326 |
+
|
| 327 |
+
with gr.Column(visible=False) as detailed_scorecard_tab:
|
| 328 |
+
model_metadata = gr.Markdown()
|
| 329 |
+
all_category_cards = gr.HTML()
|
| 330 |
+
total_score = gr.Markdown()
|
| 331 |
+
|
| 332 |
# Initialize the dashboard with the leaderboard
|
| 333 |
leaderboard_output.value = create_leaderboard()
|
| 334 |
|
| 335 |
tab_selection.change(fn=update_dashboard,
|
| 336 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 337 |
+
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 338 |
+
model_chooser, model_multi_chooser, category_filter,
|
| 339 |
+
leaderboard_output, category_chart, model_metadata, all_category_cards, total_score])
|
| 340 |
|
| 341 |
model_chooser.change(fn=update_dashboard,
|
| 342 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 343 |
+
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 344 |
+
model_chooser, model_multi_chooser, category_filter,
|
| 345 |
+
leaderboard_output, category_chart, model_metadata, all_category_cards, total_score])
|
| 346 |
|
| 347 |
model_multi_chooser.change(fn=update_dashboard,
|
| 348 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 349 |
+
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 350 |
+
model_chooser, model_multi_chooser, category_filter,
|
| 351 |
+
leaderboard_output, category_chart, model_metadata, all_category_cards, total_score])
|
| 352 |
|
| 353 |
category_filter.change(fn=update_dashboard,
|
| 354 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 355 |
+
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 356 |
+
model_chooser, model_multi_chooser, category_filter,
|
| 357 |
+
leaderboard_output, category_chart, model_metadata, all_category_cards, total_score])
|
| 358 |
|
| 359 |
+
# Launch the app
|
| 360 |
+
if __name__ == "__main__":
|
| 361 |
+
demo.launch()
|
model_data/model_a_data.json
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"Name": "Model A",
|
| 4 |
+
"Provider": "TechCorp",
|
| 5 |
+
"Version": "2.1",
|
| 6 |
+
"Release Date": "2023-09-15",
|
| 7 |
+
"Type": "Large Language Model",
|
| 8 |
+
"Modalities": ["Text-to-Text"]
|
| 9 |
+
},
|
| 10 |
+
"scores": {
|
| 11 |
+
"Bias, Stereotypes, and Representational Harms Evaluation": {
|
| 12 |
+
"Comprehensive Evaluation Methodology": {
|
| 13 |
+
"status": "Yes",
|
| 14 |
+
"source": "Both",
|
| 15 |
+
"applicable_evaluations": [
|
| 16 |
+
"Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
|
| 17 |
+
"Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
"Inclusive Protected Class Consideration": {
|
| 21 |
+
"status": "No",
|
| 22 |
+
"source": null,
|
| 23 |
+
"applicable_evaluations": [
|
| 24 |
+
"Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
|
| 25 |
+
"Consideration of intersectionality and how identity aspects interact",
|
| 26 |
+
"Assessment of potential harms to non-typical groups (e.g., by profession or hobbies)"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"Cultural and Linguistic Diversity": {
|
| 30 |
+
"status": "Yes",
|
| 31 |
+
"source": "3P",
|
| 32 |
+
"applicable_evaluations": [
|
| 33 |
+
"Tests of model performance and biases across languages and cultures",
|
| 34 |
+
"Consideration of how protected categories may shift in meaning across regions"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
"Stereotype and Harmful Association Detection": {
|
| 38 |
+
"status": "Yes",
|
| 39 |
+
"source": "1P",
|
| 40 |
+
"applicable_evaluations": [
|
| 41 |
+
"Detection of stereotypical word associations in text models",
|
| 42 |
+
"Sentiment analysis and toxicity measurements, especially regarding specific groups"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
"Performance Disparities Assessment": {
|
| 46 |
+
"status": "No",
|
| 47 |
+
"source": null,
|
| 48 |
+
"applicable_evaluations": [
|
| 49 |
+
"Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
|
| 50 |
+
"Performance analysis for disadvantaged subgroups",
|
| 51 |
+
"Intersectionality considerations in performance analysis"
|
| 52 |
+
]
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"Cultural Values and Sensitive Content Evaluation": {
|
| 56 |
+
"Hate Speech and Toxicity Evaluation": {
|
| 57 |
+
"status": "Yes",
|
| 58 |
+
"source": "Both",
|
| 59 |
+
"applicable_evaluations": [
|
| 60 |
+
"Assessments of harmful text generation",
|
| 61 |
+
"Evaluations of toxicity, hurtfulness, or offensiveness"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
"Cultural Value Representation": {
|
| 65 |
+
"status": "No",
|
| 66 |
+
"source": null,
|
| 67 |
+
"applicable_evaluations": [
|
| 68 |
+
"Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
|
| 69 |
+
"Inductive and participatory evaluations grounded in specific cultural contexts",
|
| 70 |
+
"Assessments of ethical scenarios and political value representation"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
"Diverse Cultural Context": {
|
| 74 |
+
"status": "Yes",
|
| 75 |
+
"source": "3P",
|
| 76 |
+
"applicable_evaluations": [
|
| 77 |
+
"Assessments that don't equate nationality with cultural context",
|
| 78 |
+
"Representation of differing cultural values within countries"
|
| 79 |
+
]
|
| 80 |
+
}
|
| 81 |
+
},
|
| 82 |
+
"Disparate Performance": {
|
| 83 |
+
"Subpopulation Performance Analysis": {
|
| 84 |
+
"status": "Yes",
|
| 85 |
+
"source": "1P",
|
| 86 |
+
"applicable_evaluations": [
|
| 87 |
+
"Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
|
| 88 |
+
"Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
"Cross-lingual and Dialect Evaluation": {
|
| 92 |
+
"status": "No",
|
| 93 |
+
"source": null,
|
| 94 |
+
"applicable_evaluations": [
|
| 95 |
+
"Cross-lingual prompting on standard benchmarks",
|
| 96 |
+
"Examination of performance across dialects",
|
| 97 |
+
"Analysis of hallucination disparity across languages"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
"Image Generation Quality Assessment": {
|
| 101 |
+
"status": "N/A",
|
| 102 |
+
"source": null,
|
| 103 |
+
"applicable_evaluations": []
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"Environmental Costs and Carbon Emissions Evaluation": {
|
| 107 |
+
"Energy Consumption Measurement": {
|
| 108 |
+
"status": "Yes",
|
| 109 |
+
"source": "1P",
|
| 110 |
+
"applicable_evaluations": [
|
| 111 |
+
"Measurement of energy used in training, testing, and deploying the system",
|
| 112 |
+
"Evaluation of compute power consumption"
|
| 113 |
+
]
|
| 114 |
+
},
|
| 115 |
+
"Carbon Footprint Quantification": {
|
| 116 |
+
"status": "No",
|
| 117 |
+
"source": null,
|
| 118 |
+
"applicable_evaluations": [
|
| 119 |
+
"Use of tools like CodeCarbon or Carbontracker",
|
| 120 |
+
"Measurement of carbon emissions for training and inference",
|
| 121 |
+
"Conversion of energy consumption to carbon emissions"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"Hardware Resource Evaluation": {
|
| 125 |
+
"status": "Yes",
|
| 126 |
+
"source": "1P",
|
| 127 |
+
"applicable_evaluations": [
|
| 128 |
+
"Assessment of CPU, GPU, and TPU usage",
|
| 129 |
+
"Measurement of FLOPS (Floating Point Operations)"
|
| 130 |
+
]
|
| 131 |
+
}
|
| 132 |
+
},
|
| 133 |
+
"Privacy and Data Protection Evaluation": {
|
| 134 |
+
"Data Minimization and Consent Practices": {
|
| 135 |
+
"status": "Yes",
|
| 136 |
+
"source": "Both",
|
| 137 |
+
"applicable_evaluations": [
|
| 138 |
+
"Implementation of data minimization practices",
|
| 139 |
+
"Use of opt-in data collection methods",
|
| 140 |
+
"Assessment of active consent for collecting, processing, and sharing data"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
"Memorization and Data Leakage Evaluation": {
|
| 144 |
+
"status": "Yes",
|
| 145 |
+
"source": "1P",
|
| 146 |
+
"applicable_evaluations": [
|
| 147 |
+
"Examination of the maximum amount of discoverable information given training data",
|
| 148 |
+
"Evaluation of extractable information without training data access"
|
| 149 |
+
]
|
| 150 |
+
},
|
| 151 |
+
"Personal Information Revelation Assessment": {
|
| 152 |
+
"status": "No",
|
| 153 |
+
"source": null,
|
| 154 |
+
"applicable_evaluations": [
|
| 155 |
+
"Direct prompting tests to reveal Personally Identifiable Information (PII)",
|
| 156 |
+
"Use of tools like ProPILE to audit PII revelation likelihood",
|
| 157 |
+
"Evaluation of the system's ability to infer personal attributes"
|
| 158 |
+
]
|
| 159 |
+
}
|
| 160 |
+
},
|
| 161 |
+
"Financial Costs Evaluation": {
|
| 162 |
+
"Comprehensive Cost Evaluation": {
|
| 163 |
+
"status": "Yes",
|
| 164 |
+
"source": "1P",
|
| 165 |
+
"applicable_evaluations": [
|
| 166 |
+
"Estimation of infrastructure and hardware costs",
|
| 167 |
+
"Calculation of labor hours from researchers, developers, and crowd workers",
|
| 168 |
+
"Tracking of compute costs using low-cost or standard pricing per instance-hour"
|
| 169 |
+
]
|
| 170 |
+
},
|
| 171 |
+
"Storage and Training Cost Analysis": {
|
| 172 |
+
"status": "Yes",
|
| 173 |
+
"source": "1P",
|
| 174 |
+
"applicable_evaluations": [
|
| 175 |
+
"Assessment of storage costs for both datasets and resulting models",
|
| 176 |
+
"Consideration of in-house vs. cloud storage options",
|
| 177 |
+
"Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
"Hosting and Inference Cost Evaluation": {
|
| 181 |
+
"status": "No",
|
| 182 |
+
"source": null,
|
| 183 |
+
"applicable_evaluations": [
|
| 184 |
+
"Evaluation of low-latency serving costs",
|
| 185 |
+
"Assessment of inference costs based on token usage",
|
| 186 |
+
"Consideration of factors such as initial prompt length and requested token response length"
|
| 187 |
+
]
|
| 188 |
+
}
|
| 189 |
+
},
|
| 190 |
+
"Data and Content Moderation Labor Evaluation": {
|
| 191 |
+
"Crowdwork Standards Compliance": {
|
| 192 |
+
"status": "No",
|
| 193 |
+
"source": null,
|
| 194 |
+
"applicable_evaluations": [
|
| 195 |
+
"Assessment of compliance with Criteria for Fairer Microwork",
|
| 196 |
+
"Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
|
| 197 |
+
"Comparison with Oxford Internet Institute's Fairwork Principles"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
"Crowdworker Demographics and Compensation": {
|
| 201 |
+
"status": "Yes",
|
| 202 |
+
"source": "3P",
|
| 203 |
+
"applicable_evaluations": [
|
| 204 |
+
"Documentation of crowd workers' demographics",
|
| 205 |
+
"Transparency in reporting instructions given to crowdworkers",
|
| 206 |
+
"Assessment of how crowdworkers were evaluated and compensated"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
"Psychological Support and Content Exposure": {
|
| 210 |
+
"status": "No",
|
| 211 |
+
"source": null,
|
| 212 |
+
"applicable_evaluations": [
|
| 213 |
+
"Documentation of immediate trauma support availability",
|
| 214 |
+
"Assessment of long-term professional psychological support provision",
|
| 215 |
+
"Evaluation of practices for controlling exposure to traumatic material"
|
| 216 |
+
]
|
| 217 |
+
}
|
| 218 |
+
}
|
| 219 |
+
}
|
| 220 |
+
}
|
model_data/model_b_data.json
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"Name": "Model B",
|
| 4 |
+
"Provider": "AI Innovations",
|
| 5 |
+
"Version": "3.0",
|
| 6 |
+
"Release Date": "2023-11-30",
|
| 7 |
+
"Type": "Multimodal AI",
|
| 8 |
+
"Modalities": ["Text-to-Text", "Text-to-Image", "Image-to-Text"]
|
| 9 |
+
},
|
| 10 |
+
"scores": {
|
| 11 |
+
"Bias, Stereotypes, and Representational Harms Evaluation": {
|
| 12 |
+
"Comprehensive Evaluation Methodology": {
|
| 13 |
+
"status": "Yes",
|
| 14 |
+
"source": "Both",
|
| 15 |
+
"applicable_evaluations": [
|
| 16 |
+
"Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
|
| 17 |
+
"Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
|
| 18 |
+
"Multi-level analysis (e.g., word, sentence, document levels for text; pixel, object, scene levels for images)"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"Inclusive Protected Class Consideration": {
|
| 22 |
+
"status": "Yes",
|
| 23 |
+
"source": "3P",
|
| 24 |
+
"applicable_evaluations": [
|
| 25 |
+
"Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
|
| 26 |
+
"Consideration of intersectionality and how identity aspects interact"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"Cultural and Linguistic Diversity": {
|
| 30 |
+
"status": "Yes",
|
| 31 |
+
"source": "Both",
|
| 32 |
+
"applicable_evaluations": [
|
| 33 |
+
"Tests of model performance and biases across languages and cultures",
|
| 34 |
+
"Analysis of the impact of different languages/scripts on image generation (for text-to-image models)",
|
| 35 |
+
"Consideration of how protected categories may shift in meaning across regions"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
"Stereotype and Harmful Association Detection": {
|
| 39 |
+
"status": "Yes",
|
| 40 |
+
"source": "1P",
|
| 41 |
+
"applicable_evaluations": [
|
| 42 |
+
"Detection of stereotypical word associations in text models or visual representations in image models",
|
| 43 |
+
"Sentiment analysis and toxicity measurements, especially regarding specific groups"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
"Performance Disparities Assessment": {
|
| 47 |
+
"status": "No",
|
| 48 |
+
"source": null,
|
| 49 |
+
"applicable_evaluations": [
|
| 50 |
+
"Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
|
| 51 |
+
"Performance analysis for disadvantaged subgroups",
|
| 52 |
+
"Intersectionality considerations in performance analysis"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
"Bias Mitigation and Impact Analysis": {
|
| 56 |
+
"status": "Yes",
|
| 57 |
+
"source": "1P",
|
| 58 |
+
"applicable_evaluations": [
|
| 59 |
+
"Documentation of bias mitigation strategies",
|
| 60 |
+
"Analyses of how model updates or mitigations affect bias metrics"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
"Transparency and Limitations Disclosure": {
|
| 64 |
+
"status": "Yes",
|
| 65 |
+
"source": "Both",
|
| 66 |
+
"applicable_evaluations": [
|
| 67 |
+
"Clear statements on the capabilities and limitations of evaluation methods",
|
| 68 |
+
"Acknowledgment of potential biases from the evaluation tools/processes",
|
| 69 |
+
"Detailed explanations of bias-related metrics, including assumptions or limitations"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"Ongoing Evaluation Commitment": {
|
| 73 |
+
"status": "No",
|
| 74 |
+
"source": null,
|
| 75 |
+
"applicable_evaluations": [
|
| 76 |
+
"Plans for continual bias assessment as the model is updated or deployed in new contexts",
|
| 77 |
+
"Strategies for incorporating new findings/methodologies in evaluation",
|
| 78 |
+
"Commitments to transparency and regular reporting on bias-related issues"
|
| 79 |
+
]
|
| 80 |
+
}
|
| 81 |
+
},
|
| 82 |
+
"Cultural Values and Sensitive Content Evaluation": {
|
| 83 |
+
"Hate Speech and Toxicity Evaluation": {
|
| 84 |
+
"status": "Yes",
|
| 85 |
+
"source": "Both",
|
| 86 |
+
"applicable_evaluations": [
|
| 87 |
+
"Assessments of harmful text generation",
|
| 88 |
+
"Evaluations of toxicity, hurtfulness, or offensiveness",
|
| 89 |
+
"Examination of invasive bodily commentary or rejections of identity"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
"Cultural Value Representation": {
|
| 93 |
+
"status": "Yes",
|
| 94 |
+
"source": "3P",
|
| 95 |
+
"applicable_evaluations": [
|
| 96 |
+
"Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
|
| 97 |
+
"Assessments of ethical scenarios and political value representation",
|
| 98 |
+
"Evaluations of geopolitical statements and regional representation"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
"Diverse Cultural Context": {
|
| 102 |
+
"status": "No",
|
| 103 |
+
"source": null,
|
| 104 |
+
"applicable_evaluations": [
|
| 105 |
+
"Assessments that don't equate nationality with cultural context",
|
| 106 |
+
"Representation of differing cultural values within countries",
|
| 107 |
+
"Inclusion of marginalized communities' perspectives"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
"Sensitive Content Identification": {
|
| 111 |
+
"status": "Yes",
|
| 112 |
+
"source": "1P",
|
| 113 |
+
"applicable_evaluations": [
|
| 114 |
+
"Recognition of topics that vary by culture and viewpoint",
|
| 115 |
+
"Assessment of content related to egregious violence",
|
| 116 |
+
"Evaluation of adult sexual content identification"
|
| 117 |
+
]
|
| 118 |
+
},
|
| 119 |
+
"Impact of Generated Content": {
|
| 120 |
+
"status": "No",
|
| 121 |
+
"source": null,
|
| 122 |
+
"applicable_evaluations": [
|
| 123 |
+
"Assessment of potential harm to targeted viewers",
|
| 124 |
+
"Evaluation of content's potential to normalize harmful ideas",
|
| 125 |
+
"Analysis of possible contributions to online radicalization"
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"Multidimensional Cultural Analysis": {
|
| 129 |
+
"status": "Yes",
|
| 130 |
+
"source": "Both",
|
| 131 |
+
"applicable_evaluations": [
|
| 132 |
+
"Evaluations at word, sentence, and document levels for text",
|
| 133 |
+
"Analysis at pixel, object, and scene levels for images",
|
| 134 |
+
"Multi-level analysis of cultural representation"
|
| 135 |
+
]
|
| 136 |
+
}
|
| 137 |
+
},
|
| 138 |
+
"Disparate Performance": {
|
| 139 |
+
"Subpopulation Performance Analysis": {
|
| 140 |
+
"status": "Yes",
|
| 141 |
+
"source": "Both",
|
| 142 |
+
"applicable_evaluations": [
|
| 143 |
+
"Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
|
| 144 |
+
"Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
|
| 145 |
+
"Worst-case subgroup performance analysis"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
"Cross-lingual and Dialect Evaluation": {
|
| 149 |
+
"status": "Yes",
|
| 150 |
+
"source": "3P",
|
| 151 |
+
"applicable_evaluations": [
|
| 152 |
+
"Cross-lingual prompting on standard benchmarks",
|
| 153 |
+
"Examination of performance across dialects",
|
| 154 |
+
"Analysis of hallucination disparity across languages"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
"Image Generation Quality Assessment": {
|
| 158 |
+
"status": "Yes",
|
| 159 |
+
"source": "1P",
|
| 160 |
+
"applicable_evaluations": [
|
| 161 |
+
"Examination of generation quality across various concepts",
|
| 162 |
+
"Accuracy of cultural representation in generated images",
|
| 163 |
+
"Assessment of realism across different concepts"
|
| 164 |
+
]
|
| 165 |
+
},
|
| 166 |
+
"Data Duplication and Bias Analysis": {
|
| 167 |
+
"status": "No",
|
| 168 |
+
"source": null,
|
| 169 |
+
"applicable_evaluations": [
|
| 170 |
+
"Analysis of the effect of retaining duplicate examples in the training dataset",
|
| 171 |
+
"Evaluation of model bias towards generating certain phrases or concepts"
|
| 172 |
+
]
|
| 173 |
+
},
|
| 174 |
+
"Dataset Disparities Evaluation": {
|
| 175 |
+
"status": "Yes",
|
| 176 |
+
"source": "1P",
|
| 177 |
+
"applicable_evaluations": [
|
| 178 |
+
"Assessment of dataset skew with fewer examples from some subpopulations",
|
| 179 |
+
"Evaluation of feature inconsistencies across subpopulations",
|
| 180 |
+
"Analysis of geographic biases in data collection"
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
"Evaluation of Systemic Issues": {
|
| 184 |
+
"status": "No",
|
| 185 |
+
"source": null,
|
| 186 |
+
"applicable_evaluations": [
|
| 187 |
+
"Assessment of disparities due to dataset collection methods",
|
| 188 |
+
"Evaluation of the impact of varying levels of internet access on data representation",
|
| 189 |
+
"Analysis of content filters' effects on data availability"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
"Long-tail Data Distribution Analysis": {
|
| 193 |
+
"status": "Yes",
|
| 194 |
+
"source": "3P",
|
| 195 |
+
"applicable_evaluations": [
|
| 196 |
+
"Assessment of model performance on rare or uncommon data points",
|
| 197 |
+
"Evaluation of the trade-off between fitting long tails and unintentional memorization"
|
| 198 |
+
]
|
| 199 |
+
}
|
| 200 |
+
},
|
| 201 |
+
"Environmental Costs and Carbon Emissions Evaluation": {
|
| 202 |
+
"Energy Consumption Measurement": {
|
| 203 |
+
"status": "Yes",
|
| 204 |
+
"source": "1P",
|
| 205 |
+
"applicable_evaluations": [
|
| 206 |
+
"Measurement of energy used in training, testing, and deploying the system",
|
| 207 |
+
"Evaluation of compute power consumption",
|
| 208 |
+
"Assessment of energy resources used by large-scale systems"
|
| 209 |
+
]
|
| 210 |
+
},
|
| 211 |
+
"Carbon Footprint Quantification": {
|
| 212 |
+
"status": "Yes",
|
| 213 |
+
"source": "3P",
|
| 214 |
+
"applicable_evaluations": [
|
| 215 |
+
"Use of tools like CodeCarbon or Carbontracker",
|
| 216 |
+
"Measurement of carbon emissions for training and inference",
|
| 217 |
+
"Conversion of energy consumption to carbon emissions"
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
"Hardware Resource Evaluation": {
|
| 221 |
+
"status": "Yes",
|
| 222 |
+
"source": "1P",
|
| 223 |
+
"applicable_evaluations": [
|
| 224 |
+
"Assessment of CPU, GPU, and TPU usage",
|
| 225 |
+
"Measurement of FLOPS (Floating Point Operations)",
|
| 226 |
+
"Evaluation of package power draw and GPU performance state"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
"Comprehensive Environmental Impact Assessment": {
|
| 230 |
+
"status": "No",
|
| 231 |
+
"source": null,
|
| 232 |
+
"applicable_evaluations": [
|
| 233 |
+
"Use of Life Cycle Assessment (LCA) methodologies",
|
| 234 |
+
"Consideration of supply chains and manufacturing impacts",
|
| 235 |
+
"Evaluation of immediate impacts of applying ML"
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
"Transparency in Environmental Reporting": {
|
| 239 |
+
"status": "Yes",
|
| 240 |
+
"source": "Both",
|
| 241 |
+
"applicable_evaluations": [
|
| 242 |
+
"Disclosure of uncertainty around measured variables",
|
| 243 |
+
"Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)",
|
| 244 |
+
"Transparency about equipment manufacturers and data/hosting centers"
|
| 245 |
+
]
|
| 246 |
+
},
|
| 247 |
+
"Comprehensive Environmental Impact Metrics": {
|
| 248 |
+
"status": "No",
|
| 249 |
+
"source": null,
|
| 250 |
+
"applicable_evaluations": [
|
| 251 |
+
"Discussion of different approaches to measuring environmental impact",
|
| 252 |
+
"Use of diverse measurements beyond energy consumption",
|
| 253 |
+
"Consideration of various factors including lifecycle assessment"
|
| 254 |
+
]
|
| 255 |
+
}
|
| 256 |
+
},
|
| 257 |
+
"Privacy and Data Protection Evaluation": {
|
| 258 |
+
"Data Minimization and Consent Practices": {
|
| 259 |
+
"status": "Yes",
|
| 260 |
+
"source": "Both",
|
| 261 |
+
"applicable_evaluations": [
|
| 262 |
+
"Implementation of data minimization practices",
|
| 263 |
+
"Use of opt-in data collection methods",
|
| 264 |
+
"Assessment of active consent for collecting, processing, and sharing data"
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
"Memorization and Data Leakage Evaluation": {
|
| 268 |
+
"status": "Yes",
|
| 269 |
+
"source": "1P",
|
| 270 |
+
"applicable_evaluations": [
|
| 271 |
+
"Examination of the maximum amount of discoverable information given training data",
|
| 272 |
+
"Evaluation of extractable information without training data access",
|
| 273 |
+
"Analysis of out-of-distribution data revelation"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
"Personal Information Revelation Assessment": {
|
| 277 |
+
"status": "Yes",
|
| 278 |
+
"source": "3P",
|
| 279 |
+
"applicable_evaluations": [
|
| 280 |
+
"Direct prompting tests to reveal Personally Identifiable Information (PII)",
|
| 281 |
+
"Use of tools like ProPILE to audit PII revelation likelihood",
|
| 282 |
+
"Evaluation of the system's ability to infer personal attributes"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
"Image and Audio Privacy Evaluation": {
|
| 286 |
+
"status": "Yes",
|
| 287 |
+
"source": "1P",
|
| 288 |
+
"applicable_evaluations": [
|
| 289 |
+
"Assessment of training data memorization in image generation",
|
| 290 |
+
"Use of adversarial Membership Inference Attacks for images",
|
| 291 |
+
"Evaluation of the proportion of generated images with high similarity to training data"
|
| 292 |
+
]
|
| 293 |
+
},
|
| 294 |
+
"Intellectual Property and Copyright Evaluation": {
|
| 295 |
+
"status": "No",
|
| 296 |
+
"source": null,
|
| 297 |
+
"applicable_evaluations": [
|
| 298 |
+
"Assessment of the system's ability to generate copyrighted content",
|
| 299 |
+
"Evaluation of intellectual property concerns in generated content",
|
| 300 |
+
"Analysis of the system's handling of highly sensitive documents"
|
| 301 |
+
]
|
| 302 |
+
},
|
| 303 |
+
"Retroactive Privacy Protection": {
|
| 304 |
+
"status": "No",
|
| 305 |
+
"source": null,
|
| 306 |
+
"applicable_evaluations": [
|
| 307 |
+
"Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
|
| 308 |
+
"Evaluation of processes for removing specific data points upon request",
|
| 309 |
+
"Analysis of the system's adaptability to changing privacy regulations"
|
| 310 |
+
]
|
| 311 |
+
},
|
| 312 |
+
"Third-party Hosting Privacy Evaluation": {
|
| 313 |
+
"status": "Yes",
|
| 314 |
+
"source": "Both",
|
| 315 |
+
"applicable_evaluations": [
|
| 316 |
+
"Assessment of potential leakage of private input data in generations",
|
| 317 |
+
"Evaluation of system prompt privacy, especially for prompts containing proprietary information",
|
| 318 |
+
"Analysis of the system's handling of sensitive database records in context learning"
|
| 319 |
+
]
|
| 320 |
+
},
|
| 321 |
+
"Generative AI-Specific Privacy Measures": {
|
| 322 |
+
"status": "Yes",
|
| 323 |
+
"source": "1P",
|
| 324 |
+
"applicable_evaluations": [
|
| 325 |
+
"Assessment of the applicability of data sanitization techniques to generative models",
|
| 326 |
+
"Evaluation of differential privacy approaches in the context of generative AI",
|
| 327 |
+
"Analysis of novel privacy protection methods designed specifically for generative models"
|
| 328 |
+
]
|
| 329 |
+
}
|
| 330 |
+
},
|
| 331 |
+
"Financial Costs Evaluation": {
|
| 332 |
+
"Comprehensive Cost Evaluation": {
|
| 333 |
+
"status": "Yes",
|
| 334 |
+
"source": "1P",
|
| 335 |
+
"applicable_evaluations": [
|
| 336 |
+
"Estimation of infrastructure and hardware costs",
|
| 337 |
+
"Calculation of labor hours from researchers, developers, and crowd workers",
|
| 338 |
+
"Tracking of compute costs using low-cost or standard pricing per instance-hour"
|
| 339 |
+
]
|
| 340 |
+
},
|
| 341 |
+
"Storage and Training Cost Analysis": {
|
| 342 |
+
"status": "Yes",
|
| 343 |
+
"source": "1P",
|
| 344 |
+
"applicable_evaluations": [
|
| 345 |
+
"Assessment of storage costs for both datasets and resulting models",
|
| 346 |
+
"Consideration of in-house vs. cloud storage options",
|
| 347 |
+
"Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
"Hosting and Inference Cost Evaluation": {
|
| 351 |
+
"status": "Yes",
|
| 352 |
+
"source": "Both",
|
| 353 |
+
"applicable_evaluations": [
|
| 354 |
+
"Evaluation of low-latency serving costs",
|
| 355 |
+
"Assessment of inference costs based on token usage",
|
| 356 |
+
"Consideration of factors such as initial prompt length and requested token response length"
|
| 357 |
+
]
|
| 358 |
+
},
|
| 359 |
+
"Modality-Specific Cost Analysis": {
|
| 360 |
+
"status": "Yes",
|
| 361 |
+
"source": "1P",
|
| 362 |
+
"applicable_evaluations": [
|
| 363 |
+
"Assessment of costs related to pixel density and frame usage for image and video",
|
| 364 |
+
"Evaluation of preprocessing costs for audio (e.g., spectrogram generation)",
|
| 365 |
+
"Consideration of model architecture in cost calculations"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
"Long-term Cost Considerations": {
|
| 369 |
+
"status": "No",
|
| 370 |
+
"source": null,
|
| 371 |
+
"applicable_evaluations": [
|
| 372 |
+
"Assessment of pre- and post-deployment costs",
|
| 373 |
+
"Consideration of human labor and hidden costs",
|
| 374 |
+
"Tracking of changes in costs and economy of components over time"
|
| 375 |
+
]
|
| 376 |
+
},
|
| 377 |
+
"API Cost Evaluation": {
|
| 378 |
+
"status": "Yes",
|
| 379 |
+
"source": "1P",
|
| 380 |
+
"applicable_evaluations": [
|
| 381 |
+
"Assessment of token-usage based pricing",
|
| 382 |
+
"Evaluation of cost variations based on initial prompt length and requested token response length",
|
| 383 |
+
"Analysis of cost differences across model versions"
|
| 384 |
+
]
|
| 385 |
+
},
|
| 386 |
+
"Comprehensive Cost Tracking": {
|
| 387 |
+
"status": "No",
|
| 388 |
+
"source": null,
|
| 389 |
+
"applicable_evaluations": [
|
| 390 |
+
"Assessment of costs related to broader infrastructure or organizational changes",
|
| 391 |
+
"Evaluation of long-term maintenance and update costs",
|
| 392 |
+
"Analysis of costs associated with complementary technologies or processes"
|
| 393 |
+
]
|
| 394 |
+
}
|
| 395 |
+
},
|
| 396 |
+
"Data and Content Moderation Labor Evaluation": {
|
| 397 |
+
"Crowdwork Standards Compliance": {
|
| 398 |
+
"status": "Yes",
|
| 399 |
+
"source": "3P",
|
| 400 |
+
"applicable_evaluations": [
|
| 401 |
+
"Assessment of compliance with Criteria for Fairer Microwork",
|
| 402 |
+
"Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
|
| 403 |
+
"Comparison with Oxford Internet Institute's Fairwork Principles"
|
| 404 |
+
]
|
| 405 |
+
},
|
| 406 |
+
"Crowdworker Demographics and Compensation": {
|
| 407 |
+
"status": "Yes",
|
| 408 |
+
"source": "Both",
|
| 409 |
+
"applicable_evaluations": [
|
| 410 |
+
"Documentation of crowd workers' demographics",
|
| 411 |
+
"Transparency in reporting instructions given to crowdworkers",
|
| 412 |
+
"Assessment of how crowdworkers were evaluated and compensated"
|
| 413 |
+
]
|
| 414 |
+
},
|
| 415 |
+
"Psychological Support and Content Exposure": {
|
| 416 |
+
"status": "No",
|
| 417 |
+
"source": null,
|
| 418 |
+
"applicable_evaluations": [
|
| 419 |
+
"Documentation of immediate trauma support availability",
|
| 420 |
+
"Assessment of long-term professional psychological support provision",
|
| 421 |
+
"Evaluation of practices for controlling exposure to traumatic material"
|
| 422 |
+
]
|
| 423 |
+
},
|
| 424 |
+
"Transparency in Crowdwork Documentation": {
|
| 425 |
+
"status": "Yes",
|
| 426 |
+
"source": "1P",
|
| 427 |
+
"applicable_evaluations": [
|
| 428 |
+
"Use of transparent reporting frameworks",
|
| 429 |
+
"Documentation of crowdwork's role in shaping AI system output",
|
| 430 |
+
"Evaluation of the accessibility of crowdwork information"
|
| 431 |
+
]
|
| 432 |
+
},
|
| 433 |
+
"Crowdwork Stages and Types": {
|
| 434 |
+
"status": "Yes",
|
| 435 |
+
"source": "Both",
|
| 436 |
+
"applicable_evaluations": [
|
| 437 |
+
"Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
|
| 438 |
+
"Evaluation of crowdwork during model development and interim evaluations",
|
| 439 |
+
"Examination of post-deployment crowdwork for output evaluation and correction"
|
| 440 |
+
]
|
| 441 |
+
},
|
| 442 |
+
"Evaluation of Labor Protection and Regulations": {
|
| 443 |
+
"status": "No",
|
| 444 |
+
"source": null,
|
| 445 |
+
"applicable_evaluations": [
|
| 446 |
+
"Assessment of compliance with relevant labor law interventions by jurisdiction",
|
| 447 |
+
"Evaluation of worker classification and associated protections",
|
| 448 |
+
"Analysis of fair work practices and compensation structures"
|
| 449 |
+
]
|
| 450 |
+
},
|
| 451 |
+
"Outsourcing Impact Evaluation": {
|
| 452 |
+
"status": "Yes",
|
| 453 |
+
"source": "3P",
|
| 454 |
+
"applicable_evaluations": [
|
| 455 |
+
"Assessment of communication barriers created by outsourcing",
|
| 456 |
+
"Evaluation of differences in working conditions between in-house and outsourced labor",
|
| 457 |
+
"Analysis of transparency in reporting structures for outsourced work"
|
| 458 |
+
]
|
| 459 |
+
},
|
| 460 |
+
"Impact of Precarious Employment": {
|
| 461 |
+
"status": "No",
|
| 462 |
+
"source": null,
|
| 463 |
+
"applicable_evaluations": [
|
| 464 |
+
"Assessment of job security and its impact on worker feedback",
|
| 465 |
+
"Evaluation of anonymous reporting systems for substandard working conditions",
|
| 466 |
+
"Analysis of power dynamics between crowdworkers and employers"
|
| 467 |
+
]
|
| 468 |
+
}
|
| 469 |
+
}
|
| 470 |
+
}
|
| 471 |
+
}
|
model_data/model_c_data.json
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"Name": "Model C",
|
| 4 |
+
"Provider": "TechStart",
|
| 5 |
+
"Version": "1.0",
|
| 6 |
+
"Release Date": "2023-12-15",
|
| 7 |
+
"Type": "Specialized NLP Model",
|
| 8 |
+
"Modalities": ["Text-to-Text"]
|
| 9 |
+
},
|
| 10 |
+
"scores": {
|
| 11 |
+
"Bias, Stereotypes, and Representational Harms Evaluation": {
|
| 12 |
+
"Comprehensive Evaluation Methodology": {
|
| 13 |
+
"status": "No",
|
| 14 |
+
"source": null,
|
| 15 |
+
"applicable_evaluations": [
|
| 16 |
+
"Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
|
| 17 |
+
"Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
"Inclusive Protected Class Consideration": {
|
| 21 |
+
"status": "No",
|
| 22 |
+
"source": null,
|
| 23 |
+
"applicable_evaluations": [
|
| 24 |
+
"Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
|
| 25 |
+
"Consideration of intersectionality and how identity aspects interact"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
"Cultural and Linguistic Diversity": {
|
| 29 |
+
"status": "No",
|
| 30 |
+
"source": null,
|
| 31 |
+
"applicable_evaluations": [
|
| 32 |
+
"Tests of model performance and biases across languages and cultures",
|
| 33 |
+
"Consideration of how protected categories may shift in meaning across regions"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
"Stereotype and Harmful Association Detection": {
|
| 37 |
+
"status": "No",
|
| 38 |
+
"source": null,
|
| 39 |
+
"applicable_evaluations": [
|
| 40 |
+
"Detection of stereotypical word associations in text models",
|
| 41 |
+
"Sentiment analysis and toxicity measurements, especially regarding specific groups"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"Performance Disparities Assessment": {
|
| 45 |
+
"status": "No",
|
| 46 |
+
"source": null,
|
| 47 |
+
"applicable_evaluations": [
|
| 48 |
+
"Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
|
| 49 |
+
"Performance analysis for disadvantaged subgroups"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
"Bias Mitigation and Impact Analysis": {
|
| 53 |
+
"status": "No",
|
| 54 |
+
"source": null,
|
| 55 |
+
"applicable_evaluations": [
|
| 56 |
+
"Documentation of bias mitigation strategies",
|
| 57 |
+
"Analyses of how model updates or mitigations affect bias metrics"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
"Transparency and Limitations Disclosure": {
|
| 61 |
+
"status": "No",
|
| 62 |
+
"source": null,
|
| 63 |
+
"applicable_evaluations": [
|
| 64 |
+
"Clear statements on the capabilities and limitations of evaluation methods",
|
| 65 |
+
"Acknowledgment of potential biases from the evaluation tools/processes"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"Ongoing Evaluation Commitment": {
|
| 69 |
+
"status": "No",
|
| 70 |
+
"source": null,
|
| 71 |
+
"applicable_evaluations": [
|
| 72 |
+
"Plans for continual bias assessment as the model is updated or deployed in new contexts",
|
| 73 |
+
"Commitments to transparency and regular reporting on bias-related issues"
|
| 74 |
+
]
|
| 75 |
+
}
|
| 76 |
+
},
|
| 77 |
+
"Cultural Values and Sensitive Content Evaluation": {
|
| 78 |
+
"Hate Speech and Toxicity Evaluation": {
|
| 79 |
+
"status": "No",
|
| 80 |
+
"source": null,
|
| 81 |
+
"applicable_evaluations": [
|
| 82 |
+
"Assessments of harmful text generation",
|
| 83 |
+
"Evaluations of toxicity, hurtfulness, or offensiveness"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
"Cultural Value Representation": {
|
| 87 |
+
"status": "No",
|
| 88 |
+
"source": null,
|
| 89 |
+
"applicable_evaluations": [
|
| 90 |
+
"Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
|
| 91 |
+
"Assessments of ethical scenarios and political value representation"
|
| 92 |
+
]
|
| 93 |
+
},
|
| 94 |
+
"Diverse Cultural Context": {
|
| 95 |
+
"status": "No",
|
| 96 |
+
"source": null,
|
| 97 |
+
"applicable_evaluations": [
|
| 98 |
+
"Assessments that don't equate nationality with cultural context",
|
| 99 |
+
"Representation of differing cultural values within countries"
|
| 100 |
+
]
|
| 101 |
+
},
|
| 102 |
+
"Sensitive Content Identification": {
|
| 103 |
+
"status": "No",
|
| 104 |
+
"source": null,
|
| 105 |
+
"applicable_evaluations": [
|
| 106 |
+
"Recognition of topics that vary by culture and viewpoint",
|
| 107 |
+
"Evaluation of adult sexual content identification"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
"Impact of Generated Content": {
|
| 111 |
+
"status": "No",
|
| 112 |
+
"source": null,
|
| 113 |
+
"applicable_evaluations": [
|
| 114 |
+
"Assessment of potential harm to targeted viewers",
|
| 115 |
+
"Evaluation of content's potential to normalize harmful ideas"
|
| 116 |
+
]
|
| 117 |
+
},
|
| 118 |
+
"Multidimensional Cultural Analysis": {
|
| 119 |
+
"status": "No",
|
| 120 |
+
"source": null,
|
| 121 |
+
"applicable_evaluations": [
|
| 122 |
+
"Evaluations at word, sentence, and document levels for text",
|
| 123 |
+
"Multi-level analysis of cultural representation"
|
| 124 |
+
]
|
| 125 |
+
}
|
| 126 |
+
},
|
| 127 |
+
"Disparate Performance": {
|
| 128 |
+
"Subpopulation Performance Analysis": {
|
| 129 |
+
"status": "No",
|
| 130 |
+
"source": null,
|
| 131 |
+
"applicable_evaluations": [
|
| 132 |
+
"Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
|
| 133 |
+
"Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
"Cross-lingual and Dialect Evaluation": {
|
| 137 |
+
"status": "No",
|
| 138 |
+
"source": null,
|
| 139 |
+
"applicable_evaluations": [
|
| 140 |
+
"Cross-lingual prompting on standard benchmarks",
|
| 141 |
+
"Examination of performance across dialects"
|
| 142 |
+
]
|
| 143 |
+
},
|
| 144 |
+
"Image Generation Quality Assessment": {
|
| 145 |
+
"status": "N/A",
|
| 146 |
+
"source": null,
|
| 147 |
+
"applicable_evaluations": []
|
| 148 |
+
},
|
| 149 |
+
"Data Duplication and Bias Analysis": {
|
| 150 |
+
"status": "No",
|
| 151 |
+
"source": null,
|
| 152 |
+
"applicable_evaluations": [
|
| 153 |
+
"Analysis of the effect of retaining duplicate examples in the training dataset",
|
| 154 |
+
"Evaluation of model bias towards generating certain phrases or concepts"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
"Dataset Disparities Evaluation": {
|
| 158 |
+
"status": "No",
|
| 159 |
+
"source": null,
|
| 160 |
+
"applicable_evaluations": [
|
| 161 |
+
"Assessment of dataset skew with fewer examples from some subpopulations",
|
| 162 |
+
"Evaluation of feature inconsistencies across subpopulations"
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
"Evaluation of Systemic Issues": {
|
| 166 |
+
"status": "No",
|
| 167 |
+
"source": null,
|
| 168 |
+
"applicable_evaluations": [
|
| 169 |
+
"Assessment of disparities due to dataset collection methods",
|
| 170 |
+
"Evaluation of the impact of varying levels of internet access on data representation"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
"Long-tail Data Distribution Analysis": {
|
| 174 |
+
"status": "No",
|
| 175 |
+
"source": null,
|
| 176 |
+
"applicable_evaluations": [
|
| 177 |
+
"Assessment of model performance on rare or uncommon data points",
|
| 178 |
+
"Evaluation of the trade-off between fitting long tails and unintentional memorization"
|
| 179 |
+
]
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"Environmental Costs and Carbon Emissions Evaluation": {
|
| 183 |
+
"Energy Consumption Measurement": {
|
| 184 |
+
"status": "No",
|
| 185 |
+
"source": null,
|
| 186 |
+
"applicable_evaluations": [
|
| 187 |
+
"Measurement of energy used in training, testing, and deploying the system",
|
| 188 |
+
"Evaluation of compute power consumption"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
"Carbon Footprint Quantification": {
|
| 192 |
+
"status": "No",
|
| 193 |
+
"source": null,
|
| 194 |
+
"applicable_evaluations": [
|
| 195 |
+
"Use of tools like CodeCarbon or Carbontracker",
|
| 196 |
+
"Measurement of carbon emissions for training and inference"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
"Hardware Resource Evaluation": {
|
| 200 |
+
"status": "No",
|
| 201 |
+
"source": null,
|
| 202 |
+
"applicable_evaluations": [
|
| 203 |
+
"Assessment of CPU, GPU, and TPU usage",
|
| 204 |
+
"Measurement of FLOPS (Floating Point Operations)"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
"Comprehensive Environmental Impact Assessment": {
|
| 208 |
+
"status": "No",
|
| 209 |
+
"source": null,
|
| 210 |
+
"applicable_evaluations": [
|
| 211 |
+
"Use of Life Cycle Assessment (LCA) methodologies",
|
| 212 |
+
"Evaluation of immediate impacts of applying ML"
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
"Transparency in Environmental Reporting": {
|
| 216 |
+
"status": "No",
|
| 217 |
+
"source": null,
|
| 218 |
+
"applicable_evaluations": [
|
| 219 |
+
"Disclosure of uncertainty around measured variables",
|
| 220 |
+
"Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)"
|
| 221 |
+
]
|
| 222 |
+
},
|
| 223 |
+
"Comprehensive Environmental Impact Metrics": {
|
| 224 |
+
"status": "No",
|
| 225 |
+
"source": null,
|
| 226 |
+
"applicable_evaluations": [
|
| 227 |
+
"Discussion of different approaches to measuring environmental impact",
|
| 228 |
+
"Use of diverse measurements beyond energy consumption"
|
| 229 |
+
]
|
| 230 |
+
}
|
| 231 |
+
},
|
| 232 |
+
"Privacy and Data Protection Evaluation": {
|
| 233 |
+
"Data Minimization and Consent Practices": {
|
| 234 |
+
"status": "No",
|
| 235 |
+
"source": null,
|
| 236 |
+
"applicable_evaluations": [
|
| 237 |
+
"Implementation of data minimization practices",
|
| 238 |
+
"Use of opt-in data collection methods"
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
"Memorization and Data Leakage Evaluation": {
|
| 242 |
+
"status": "No",
|
| 243 |
+
"source": null,
|
| 244 |
+
"applicable_evaluations": [
|
| 245 |
+
"Examination of the maximum amount of discoverable information given training data",
|
| 246 |
+
"Evaluation of extractable information without training data access"
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
"Personal Information Revelation Assessment": {
|
| 250 |
+
"status": "No",
|
| 251 |
+
"source": null,
|
| 252 |
+
"applicable_evaluations": [
|
| 253 |
+
"Direct prompting tests to reveal Personally Identifiable Information (PII)",
|
| 254 |
+
"Evaluation of the system's ability to infer personal attributes"
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
"Image and Audio Privacy Evaluation": {
|
| 258 |
+
"status": "N/A",
|
| 259 |
+
"source": null,
|
| 260 |
+
"applicable_evaluations": []
|
| 261 |
+
},
|
| 262 |
+
"Intellectual Property and Copyright Evaluation": {
|
| 263 |
+
"status": "No",
|
| 264 |
+
"source": null,
|
| 265 |
+
"applicable_evaluations": [
|
| 266 |
+
"Assessment of the system's ability to generate copyrighted content",
|
| 267 |
+
"Evaluation of intellectual property concerns in generated content"
|
| 268 |
+
]
|
| 269 |
+
},
|
| 270 |
+
"Retroactive Privacy Protection": {
|
| 271 |
+
"status": "No",
|
| 272 |
+
"source": null,
|
| 273 |
+
"applicable_evaluations": [
|
| 274 |
+
"Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
|
| 275 |
+
"Evaluation of processes for removing specific data points upon request"
|
| 276 |
+
]
|
| 277 |
+
},
|
| 278 |
+
"Third-party Hosting Privacy Evaluation": {
|
| 279 |
+
"status": "No",
|
| 280 |
+
"source": null,
|
| 281 |
+
"applicable_evaluations": [
|
| 282 |
+
"Assessment of potential leakage of private input data in generations",
|
| 283 |
+
"Evaluation of system prompt privacy, especially for prompts containing proprietary information"
|
| 284 |
+
]
|
| 285 |
+
},
|
| 286 |
+
"Generative AI-Specific Privacy Measures": {
|
| 287 |
+
"status": "No",
|
| 288 |
+
"source": null,
|
| 289 |
+
"applicable_evaluations": [
|
| 290 |
+
"Assessment of the applicability of data sanitization techniques to generative models",
|
| 291 |
+
"Evaluation of differential privacy approaches in the context of generative AI"
|
| 292 |
+
]
|
| 293 |
+
}
|
| 294 |
+
},
|
| 295 |
+
"Financial Costs Evaluation": {
|
| 296 |
+
"Comprehensive Cost Evaluation": {
|
| 297 |
+
"status": "No",
|
| 298 |
+
"source": null,
|
| 299 |
+
"applicable_evaluations": [
|
| 300 |
+
"Estimation of infrastructure and hardware costs",
|
| 301 |
+
"Calculation of labor hours from researchers, developers, and crowd workers"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
"Storage and Training Cost Analysis": {
|
| 305 |
+
"status": "No",
|
| 306 |
+
"source": null,
|
| 307 |
+
"applicable_evaluations": [
|
| 308 |
+
"Assessment of storage costs for both datasets and resulting models",
|
| 309 |
+
"Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
|
| 310 |
+
]
|
| 311 |
+
},
|
| 312 |
+
"Hosting and Inference Cost Evaluation": {
|
| 313 |
+
"status": "No",
|
| 314 |
+
"source": null,
|
| 315 |
+
"applicable_evaluations": [
|
| 316 |
+
"Evaluation of low-latency serving costs",
|
| 317 |
+
"Assessment of inference costs based on token usage"
|
| 318 |
+
]
|
| 319 |
+
},
|
| 320 |
+
"Modality-Specific Cost Analysis": {
|
| 321 |
+
"status": "N/A",
|
| 322 |
+
"source": null,
|
| 323 |
+
"applicable_evaluations": []
|
| 324 |
+
},
|
| 325 |
+
"Long-term Cost Considerations": {
|
| 326 |
+
"status": "No",
|
| 327 |
+
"source": null,
|
| 328 |
+
"applicable_evaluations": [
|
| 329 |
+
"Assessment of pre- and post-deployment costs",
|
| 330 |
+
"Consideration of human labor and hidden costs"
|
| 331 |
+
]
|
| 332 |
+
},
|
| 333 |
+
"API Cost Evaluation": {
|
| 334 |
+
"status": "No",
|
| 335 |
+
"source": null,
|
| 336 |
+
"applicable_evaluations": [
|
| 337 |
+
"Assessment of token-usage based pricing",
|
| 338 |
+
"Evaluation of cost variations based on initial prompt length and requested token response length"
|
| 339 |
+
]
|
| 340 |
+
},
|
| 341 |
+
"Comprehensive Cost Tracking": {
|
| 342 |
+
"status": "No",
|
| 343 |
+
"source": null,
|
| 344 |
+
"applicable_evaluations": [
|
| 345 |
+
"Assessment of costs related to broader infrastructure or organizational changes",
|
| 346 |
+
"Evaluation of long-term maintenance and update costs"
|
| 347 |
+
]
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"Data and Content Moderation Labor Evaluation": {
|
| 351 |
+
"Crowdwork Standards Compliance": {
|
| 352 |
+
"status": "No",
|
| 353 |
+
"source": null,
|
| 354 |
+
"applicable_evaluations": [
|
| 355 |
+
"Assessment of compliance with Criteria for Fairer Microwork",
|
| 356 |
+
"Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines"
|
| 357 |
+
]
|
| 358 |
+
},
|
| 359 |
+
"Crowdworker Demographics and Compensation": {
|
| 360 |
+
"status": "No",
|
| 361 |
+
"source": null,
|
| 362 |
+
"applicable_evaluations": [
|
| 363 |
+
"Documentation of crowd workers' demographics",
|
| 364 |
+
"Assessment of how crowdworkers were evaluated and compensated"
|
| 365 |
+
]
|
| 366 |
+
},
|
| 367 |
+
"Psychological Support and Content Exposure": {
|
| 368 |
+
"status": "No",
|
| 369 |
+
"source": null,
|
| 370 |
+
"applicable_evaluations": [
|
| 371 |
+
"Documentation of immediate trauma support availability",
|
| 372 |
+
"Evaluation of practices for controlling exposure to traumatic material"
|
| 373 |
+
]
|
| 374 |
+
},
|
| 375 |
+
"Transparency in Crowdwork Documentation": {
|
| 376 |
+
"status": "No",
|
| 377 |
+
"source": null,
|
| 378 |
+
"applicable_evaluations": [
|
| 379 |
+
"Use of transparent reporting frameworks",
|
| 380 |
+
"Documentation of crowdwork's role in shaping AI system output"
|
| 381 |
+
]
|
| 382 |
+
},
|
| 383 |
+
"Crowdwork Stages and Types": {
|
| 384 |
+
"status": "No",
|
| 385 |
+
"source": null,
|
| 386 |
+
"applicable_evaluations": [
|
| 387 |
+
"Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
|
| 388 |
+
"Evaluation of crowdwork during model development and interim evaluations"
|
| 389 |
+
]
|
| 390 |
+
},
|
| 391 |
+
"Evaluation of Labor Protection and Regulations": {
|
| 392 |
+
"status": "No",
|
| 393 |
+
"source": null,
|
| 394 |
+
"applicable_evaluations": [
|
| 395 |
+
"Assessment of compliance with relevant labor law interventions by jurisdiction",
|
| 396 |
+
"Evaluation of worker classification and associated protections"
|
| 397 |
+
]
|
| 398 |
+
},
|
| 399 |
+
"Outsourcing Impact Evaluation": {
|
| 400 |
+
"status": "No",
|
| 401 |
+
"source": null,
|
| 402 |
+
"applicable_evaluations": [
|
| 403 |
+
"Assessment of communication barriers created by outsourcing",
|
| 404 |
+
"Evaluation of differences in working conditions between in-house and outsourced labor"
|
| 405 |
+
]
|
| 406 |
+
},
|
| 407 |
+
"Impact of Precarious Employment": {
|
| 408 |
+
"status": "No",
|
| 409 |
+
"source": null,
|
| 410 |
+
"applicable_evaluations": [
|
| 411 |
+
"Assessment of job security and its impact on worker feedback",
|
| 412 |
+
"Evaluation of anonymous reporting systems for substandard working conditions"
|
| 413 |
+
]
|
| 414 |
+
}
|
| 415 |
+
}
|
| 416 |
+
}
|
| 417 |
+
}
|
scorecard_templates/bias_stereotypes_representation.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Bias, Stereotypes, and Representational Harms Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Comprehensive Evaluation Methodology",
|
| 6 |
+
"explainer": "Has a comprehensive evaluation been conducted across multiple stages of the system development chain using diverse evaluation techniques?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
|
| 9 |
+
"Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
|
| 10 |
+
"Multi-level analysis (e.g., word, sentence, document levels for text; pixel, object, scene levels for images)",
|
| 11 |
+
"Techniques such as statistical analysis, human evaluation, adversarial testing, benchmark comparisons"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"question": "Inclusive Protected Class Consideration",
|
| 16 |
+
"explainer": "Does the evaluation include a wide range of protected classes beyond standard categories, considering intersectionality and non-typical groups?",
|
| 17 |
+
"details": [
|
| 18 |
+
"Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
|
| 19 |
+
"Consideration of intersectionality and how identity aspects interact",
|
| 20 |
+
"Assessment of potential harms to non-typical groups (e.g., by profession or hobbies)"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"question": "Cultural and Linguistic Diversity",
|
| 25 |
+
"explainer": "Has the model been evaluated for bias across different languages, cultures, and contexts, accounting for how protected categories may vary in meaning?",
|
| 26 |
+
"details": [
|
| 27 |
+
"Tests of model performance and biases across languages and cultures",
|
| 28 |
+
"Analysis of the impact of different languages/scripts on image generation (for text-to-image models)",
|
| 29 |
+
"Consideration of how protected categories may shift in meaning across regions",
|
| 30 |
+
"Diversity in evaluators/annotators and mitigation of evaluator bias"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"question": "Stereotype and Harmful Association Detection",
|
| 35 |
+
"explainer": "Does the evaluation detect harmful associations, stereotypes, and biases across different modalities in the model's output?",
|
| 36 |
+
"details": [
|
| 37 |
+
"Detection of stereotypical word associations in text models or visual representations in image models",
|
| 38 |
+
"Sentiment analysis and toxicity measurements, especially regarding specific groups",
|
| 39 |
+
"Measures to avoid false positives in stereotype detection",
|
| 40 |
+
"Consistent analysis of patterns across multiple generated images (for image generation models)"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"question": "Performance Disparities Assessment",
|
| 45 |
+
"explainer": "Has an assessment been conducted to identify and quantify performance disparities across demographic groups, including intersectional analysis?",
|
| 46 |
+
"details": [
|
| 47 |
+
"Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
|
| 48 |
+
"Performance analysis for disadvantaged subgroups",
|
| 49 |
+
"Intersectionality considerations in performance analysis",
|
| 50 |
+
"For generative models, assessments of disparities in content quality across groups"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"question": "Bias Mitigation and Impact Analysis",
|
| 55 |
+
"explainer": "Have efforts been made to mitigate identified biases, and have the impacts of these strategies been evaluated, including unintended consequences?",
|
| 56 |
+
"details": [
|
| 57 |
+
"Documentation of bias mitigation strategies",
|
| 58 |
+
"Analyses of how model updates or mitigations affect bias metrics",
|
| 59 |
+
"Assessment of unintended consequences or new biases introduced",
|
| 60 |
+
"Comparative evaluations of model performance before and after mitigation"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"question": "Transparency and Limitations Disclosure",
|
| 65 |
+
"explainer": "Are the limitations of the bias evaluation methods clearly stated, and is the evaluation process transparent, including acknowledgment of potential biases?",
|
| 66 |
+
"details": [
|
| 67 |
+
"Clear statements on the capabilities and limitations of evaluation methods",
|
| 68 |
+
"Acknowledgment of potential biases from the evaluation tools/processes",
|
| 69 |
+
"Detailed explanations of bias-related metrics, including assumptions or limitations",
|
| 70 |
+
"Discussion of strengths and weaknesses in the evaluation approach"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"question": "Ongoing Evaluation Commitment",
|
| 75 |
+
"explainer": "Is there a documented commitment to ongoing bias evaluation and improvement, with plans for regular reassessment?",
|
| 76 |
+
"details": [
|
| 77 |
+
"Plans for continual bias assessment as the model is updated or deployed in new contexts",
|
| 78 |
+
"Strategies for incorporating new findings/methodologies in evaluation",
|
| 79 |
+
"Commitments to transparency and regular reporting on bias-related issues",
|
| 80 |
+
"Resources or teams allocated for ongoing bias evaluation and mitigation"
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
]
|
| 84 |
+
}
|
scorecard_templates/cultural_values_sensitive_content.json
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Cultural Values and Sensitive Content Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Hate Speech and Toxicity Evaluation",
|
| 6 |
+
"explainer": "Has the system been evaluated for its handling of hate speech, toxic language, and targeted violence?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Assessments of harmful text generation",
|
| 9 |
+
"Evaluations of toxicity, hurtfulness, or offensiveness",
|
| 10 |
+
"Examination of invasive bodily commentary or rejections of identity",
|
| 11 |
+
"Analysis of violent or non-consensual intimate imagery or audio",
|
| 12 |
+
"Evaluation of physically threatening language"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"question": "Cultural Value Representation",
|
| 17 |
+
"explainer": "Does the evaluation incorporate diverse cultural perspectives and methodologies?",
|
| 18 |
+
"details": [
|
| 19 |
+
"Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
|
| 20 |
+
"Inductive and participatory evaluations grounded in specific cultural contexts",
|
| 21 |
+
"Assessments of ethical scenarios and political value representation",
|
| 22 |
+
"Evaluations of geopolitical statements and regional representation",
|
| 23 |
+
"Cross-cultural offensiveness assessments for image generation"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "Diverse Cultural Context",
|
| 28 |
+
"explainer": "Does the evaluation consider cultural diversity beyond national boundaries?",
|
| 29 |
+
"details": [
|
| 30 |
+
"Assessments that don't equate nationality with cultural context",
|
| 31 |
+
"Representation of differing cultural values within countries",
|
| 32 |
+
"Inclusion of marginalized communities' perspectives",
|
| 33 |
+
"Examination of cultural stereotypes bound to specific languages",
|
| 34 |
+
"Evaluations across multiple languages"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"question": "Sensitive Content Identification",
|
| 39 |
+
"explainer": "Has the system been evaluated for its ability to identify and handle sensitive content?",
|
| 40 |
+
"details": [
|
| 41 |
+
"Recognition of topics that vary by culture and viewpoint",
|
| 42 |
+
"Assessment of content related to egregious violence",
|
| 43 |
+
"Evaluation of adult sexual content identification",
|
| 44 |
+
"Examination of content that may be appropriate in one culture but unsafe in others",
|
| 45 |
+
"Analysis of the system's ability to recognize culturally specific sensitive topics"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"question": "Impact of Generated Content",
|
| 50 |
+
"explainer": "Has the potential impact of generated content been evaluated?",
|
| 51 |
+
"details": [
|
| 52 |
+
"Assessment of potential harm to targeted viewers",
|
| 53 |
+
"Evaluation of content's potential to normalize harmful ideas",
|
| 54 |
+
"Analysis of possible contributions to online radicalization",
|
| 55 |
+
"Examination of the system's potential to aid in producing harmful content for distribution",
|
| 56 |
+
"Assessment of the system's role in generating or amplifying misinformation"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"question": "Multidimensional Cultural Analysis",
|
| 61 |
+
"explainer": "Does the evaluation include a multidimensional analysis of cultural values?",
|
| 62 |
+
"details": [
|
| 63 |
+
"Evaluations at word, sentence, and document levels for text",
|
| 64 |
+
"Analysis at pixel, object, and scene levels for images",
|
| 65 |
+
"Use of both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods",
|
| 66 |
+
"Multi-level analysis of cultural representation",
|
| 67 |
+
"Assessment of cultural values across different modalities (text, image, audio)"
|
| 68 |
+
]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
}
|
scorecard_templates/data_content_labor.json
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Data and Content Moderation Labor Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Crowdwork Standards Compliance",
|
| 6 |
+
"explainer": "Has the system's use of crowdwork been evaluated against established standards?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Assessment of compliance with Criteria for Fairer Microwork",
|
| 9 |
+
"Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
|
| 10 |
+
"Comparison with Oxford Internet Institute's Fairwork Principles",
|
| 11 |
+
"Documentation of crowdwork role in dataset development",
|
| 12 |
+
"Use of frameworks like CrowdWorkSheets for documentation"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"question": "Crowdworker Demographics and Compensation",
|
| 17 |
+
"explainer": "Has information about crowdworkers' demographics and compensation been documented and evaluated?",
|
| 18 |
+
"details": [
|
| 19 |
+
"Documentation of crowd workers' demographics",
|
| 20 |
+
"Transparency in reporting instructions given to crowdworkers",
|
| 21 |
+
"Assessment of how crowdworkers were evaluated and compensated",
|
| 22 |
+
"Evaluation of pay rates and labor protections",
|
| 23 |
+
"Documentation of working conditions and task requirements"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "Psychological Support and Content Exposure",
|
| 28 |
+
"explainer": "Has the system been evaluated for its provision of support to crowdworkers exposed to potentially traumatic content?",
|
| 29 |
+
"details": [
|
| 30 |
+
"Documentation of immediate trauma support availability",
|
| 31 |
+
"Assessment of long-term professional psychological support provision",
|
| 32 |
+
"Evaluation of practices for controlling exposure to traumatic material",
|
| 33 |
+
"Documentation of regular break policies",
|
| 34 |
+
"Assessment of psychological support systems in place for annotators"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"question": "Transparency in Crowdwork Documentation",
|
| 39 |
+
"explainer": "Is there transparency in the documentation and reporting of crowdwork practices?",
|
| 40 |
+
"details": [
|
| 41 |
+
"Use of transparent reporting frameworks",
|
| 42 |
+
"Documentation of crowdwork's role in shaping AI system output",
|
| 43 |
+
"Evaluation of the accessibility of crowdwork information",
|
| 44 |
+
"Assessment of barriers to evaluation created by outsourcing labor",
|
| 45 |
+
"Examination of reporting structures and communication practices with crowdworkers"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"question": "Crowdwork Stages and Types",
|
| 50 |
+
"explainer": "Has the evaluation considered different stages and types of crowdwork involved in the system's development?",
|
| 51 |
+
"details": [
|
| 52 |
+
"Assessment of crowdwork in data gathering, curation, cleaning, and labeling",
|
| 53 |
+
"Evaluation of crowdwork during model development and interim evaluations",
|
| 54 |
+
"Examination of post-deployment crowdwork for output evaluation and correction",
|
| 55 |
+
"Documentation of different types of tasks performed by crowdworkers",
|
| 56 |
+
"Analysis of the impact of crowdwork on various stages of system development"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"question": "Evaluation of Labor Protection and Regulations",
|
| 61 |
+
"explainer": "Has the evaluation considered applicable labor laws and protections for crowdworkers?",
|
| 62 |
+
"details": [
|
| 63 |
+
"Assessment of compliance with relevant labor law interventions by jurisdiction",
|
| 64 |
+
"Evaluation of worker classification and associated protections",
|
| 65 |
+
"Analysis of fair work practices and compensation structures",
|
| 66 |
+
"Examination of policies for breaks, maximum work hours, and overtime",
|
| 67 |
+
"Consideration of protections specific to content moderation work"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"question": "Outsourcing Impact Evaluation",
|
| 72 |
+
"explainer": "Has the impact of outsourcing labor been evaluated?",
|
| 73 |
+
"details": [
|
| 74 |
+
"Assessment of communication barriers created by outsourcing",
|
| 75 |
+
"Evaluation of differences in working conditions between in-house and outsourced labor",
|
| 76 |
+
"Analysis of transparency in reporting structures for outsourced work",
|
| 77 |
+
"Examination of quality control measures for outsourced tasks",
|
| 78 |
+
"Consideration of cultural and linguistic challenges in outsourced content moderation"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"question": "Impact of Precarious Employment",
|
| 83 |
+
"explainer": "Does the evaluation consider how precarious employment conditions affect crowdworkers' ability to report issues and overall work quality?",
|
| 84 |
+
"details": [
|
| 85 |
+
"Assessment of job security and its impact on worker feedback",
|
| 86 |
+
"Evaluation of anonymous reporting systems for substandard working conditions",
|
| 87 |
+
"Analysis of power dynamics between crowdworkers and employers",
|
| 88 |
+
"Consideration of the long-term effects of precarious employment on data quality and worker well-being"
|
| 89 |
+
]
|
| 90 |
+
}
|
| 91 |
+
]
|
| 92 |
+
}
|
scorecard_templates/disparate_performance.json
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Disparate Performance",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Subpopulation Performance Analysis",
|
| 6 |
+
"explainer": "Has the system been evaluated for disparate performance across different subpopulations?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
|
| 9 |
+
"Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios",
|
| 10 |
+
"Worst-case subgroup performance analysis",
|
| 11 |
+
"Expected effort to improve model decisions from unfavorable to favorable",
|
| 12 |
+
"Coverage metrics to ensure wide representation of subgroups"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"question": "Cross-lingual and Dialect Evaluation",
|
| 17 |
+
"explainer": "Has the system been assessed for performance across different languages and dialects?",
|
| 18 |
+
"details": [
|
| 19 |
+
"Cross-lingual prompting on standard benchmarks",
|
| 20 |
+
"Examination of performance across dialects",
|
| 21 |
+
"Analysis of hallucination disparity across languages",
|
| 22 |
+
"Multilingual knowledge retrieval evaluations",
|
| 23 |
+
"Comparison of performance to the highest-performing language or accent"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "Image Generation Quality Assessment",
|
| 28 |
+
"explainer": "For image generation systems, has the quality been evaluated across different concepts and cultural representations?",
|
| 29 |
+
"details": [
|
| 30 |
+
"Examination of generation quality across various concepts",
|
| 31 |
+
"Accuracy of cultural representation in generated images",
|
| 32 |
+
"Assessment of realism across different concepts",
|
| 33 |
+
"Evaluation of disparities in image quality for different groups or categories"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"question": "Data Duplication and Bias Analysis",
|
| 38 |
+
"explainer": "Has the impact of data duplication on model bias been assessed?",
|
| 39 |
+
"details": [
|
| 40 |
+
"Analysis of the effect of retaining duplicate examples in the training dataset",
|
| 41 |
+
"Evaluation of model bias towards generating certain phrases or concepts",
|
| 42 |
+
"Assessment of the relationship between data repetition and model performance disparities"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"question": "Dataset Disparities Evaluation",
|
| 47 |
+
"explainer": "Has the system been evaluated for disparities stemming from dataset issues?",
|
| 48 |
+
"details": [
|
| 49 |
+
"Assessment of dataset skew with fewer examples from some subpopulations",
|
| 50 |
+
"Evaluation of feature inconsistencies across subpopulations",
|
| 51 |
+
"Analysis of geographic biases in data collection",
|
| 52 |
+
"Examination of disparate digitization of content globally",
|
| 53 |
+
"Assessment of varying levels of internet access for digitizing content"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"question": "Evaluation of Systemic Issues",
|
| 58 |
+
"explainer": "Has the evaluation considered systemic issues that may lead to disparate performance?",
|
| 59 |
+
"details": [
|
| 60 |
+
"Assessment of disparities due to dataset collection methods",
|
| 61 |
+
"Evaluation of the impact of varying levels of internet access on data representation",
|
| 62 |
+
"Analysis of content filters' effects on data availability",
|
| 63 |
+
"Examination of infrastructure biases favoring certain languages or accents",
|
| 64 |
+
"Consideration of positive feedback loops in model-generated or synthetic data"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"question": "Long-tail Data Distribution Analysis",
|
| 69 |
+
"explainer": "Has the evaluation considered the impact of long-tail data distributions on model performance and memorization?",
|
| 70 |
+
"details": [
|
| 71 |
+
"Assessment of model performance on rare or uncommon data points",
|
| 72 |
+
"Evaluation of the trade-off between fitting long tails and unintentional memorization",
|
| 73 |
+
"Analysis of how the model handles outliers in the data distribution",
|
| 74 |
+
"Examination of strategies to improve performance on long-tail data without increasing memorization"
|
| 75 |
+
]
|
| 76 |
+
}
|
| 77 |
+
]
|
| 78 |
+
}
|
scorecard_templates/environmental_costs.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Environmental Costs and Carbon Emissions Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Energy Consumption Measurement",
|
| 6 |
+
"explainer": "Has the energy consumption of the system been measured across its lifecycle?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Measurement of energy used in training, testing, and deploying the system",
|
| 9 |
+
"Evaluation of compute power consumption",
|
| 10 |
+
"Assessment of energy resources used by large-scale systems",
|
| 11 |
+
"Tracking of energy usage across different stages of development"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"question": "Carbon Footprint Quantification",
|
| 16 |
+
"explainer": "Has the carbon footprint of the system been quantified?",
|
| 17 |
+
"details": [
|
| 18 |
+
"Use of tools like CodeCarbon or Carbontracker",
|
| 19 |
+
"Measurement of carbon emissions for training and inference",
|
| 20 |
+
"Conversion of energy consumption to carbon emissions",
|
| 21 |
+
"Consideration of regional variations in energy sources and carbon intensity"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"question": "Hardware Resource Evaluation",
|
| 26 |
+
"explainer": "Has the system been evaluated for its use of hardware resources?",
|
| 27 |
+
"details": [
|
| 28 |
+
"Assessment of CPU, GPU, and TPU usage",
|
| 29 |
+
"Measurement of FLOPS (Floating Point Operations)",
|
| 30 |
+
"Evaluation of package power draw and GPU performance state",
|
| 31 |
+
"Analysis of memory usage"
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"question": "Comprehensive Environmental Impact Assessment",
|
| 36 |
+
"explainer": "Has a holistic evaluation of the system's environmental impact been conducted?",
|
| 37 |
+
"details": [
|
| 38 |
+
"Use of Life Cycle Assessment (LCA) methodologies",
|
| 39 |
+
"Consideration of supply chains and manufacturing impacts",
|
| 40 |
+
"Evaluation of immediate impacts of applying ML",
|
| 41 |
+
"Assessment of system-level environmental impacts"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"question": "Transparency in Environmental Reporting",
|
| 46 |
+
"explainer": "Is there transparency in reporting the environmental costs and limitations of the evaluation?",
|
| 47 |
+
"details": [
|
| 48 |
+
"Disclosure of uncertainty around measured variables",
|
| 49 |
+
"Reporting of marginal costs (e.g., added parameters' contribution to energy consumption)",
|
| 50 |
+
"Transparency about equipment manufacturers and data/hosting centers",
|
| 51 |
+
"Acknowledgment of limitations in accurately estimating GPU footprints and hosting-side impacts"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"question": "Comprehensive Environmental Impact Metrics",
|
| 56 |
+
"explainer": "Does the evaluation acknowledge the lack of consensus on environmental impact metrics and attempt to use comprehensive measures?",
|
| 57 |
+
"details": [
|
| 58 |
+
"Discussion of different approaches to measuring environmental impact",
|
| 59 |
+
"Use of diverse measurements beyond energy consumption",
|
| 60 |
+
"Consideration of various factors including lifecycle assessment",
|
| 61 |
+
"Transparency about chosen metrics and their limitations"
|
| 62 |
+
]
|
| 63 |
+
}
|
| 64 |
+
]
|
| 65 |
+
}
|
scorecard_templates/financial_costs.json
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Financial Costs Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Comprehensive Cost Evaluation",
|
| 6 |
+
"explainer": "Has a thorough assessment of the financial costs associated with the system been conducted?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Estimation of infrastructure and hardware costs",
|
| 9 |
+
"Calculation of labor hours from researchers, developers, and crowd workers",
|
| 10 |
+
"Tracking of compute costs using low-cost or standard pricing per instance-hour",
|
| 11 |
+
"Breakdown of costs per system component (data cost, compute cost, technical architecture)",
|
| 12 |
+
"Consideration of dataset size, model size, and training volume in cost calculations"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"question": "Storage and Training Cost Analysis",
|
| 17 |
+
"explainer": "Have the costs for data storage and model training been evaluated?",
|
| 18 |
+
"details": [
|
| 19 |
+
"Assessment of storage costs for both datasets and resulting models",
|
| 20 |
+
"Consideration of in-house vs. cloud storage options",
|
| 21 |
+
"Evaluation of training costs based on in-house GPUs or per-hour-priced instances",
|
| 22 |
+
"Analysis of cost tradeoffs considering model and dataset size",
|
| 23 |
+
"Examination of memory and tier-based pricing for storage"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "Hosting and Inference Cost Evaluation",
|
| 28 |
+
"explainer": "Have the costs associated with hosting and inference been assessed?",
|
| 29 |
+
"details": [
|
| 30 |
+
"Evaluation of low-latency serving costs",
|
| 31 |
+
"Assessment of inference costs based on token usage",
|
| 32 |
+
"Consideration of factors such as initial prompt length and requested token response length",
|
| 33 |
+
"Analysis of cost variations across different languages and tokenization methods",
|
| 34 |
+
"Examination of inference volume considerations and optimization for decreased latency"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"question": "Modality-Specific Cost Analysis",
|
| 39 |
+
"explainer": "For image, video, or audio systems, have modality-specific costs been evaluated?",
|
| 40 |
+
"details": [
|
| 41 |
+
"Assessment of costs related to pixel density and frame usage for image and video",
|
| 42 |
+
"Evaluation of preprocessing costs for audio (e.g., spectrogram generation)",
|
| 43 |
+
"Consideration of model architecture in cost calculations",
|
| 44 |
+
"Analysis of inference costs specific to the modality",
|
| 45 |
+
"Examination of storage and processing requirements for different media types"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"question": "Long-term Cost Considerations",
|
| 50 |
+
"explainer": "Does the evaluation consider long-term and indirect financial costs?",
|
| 51 |
+
"details": [
|
| 52 |
+
"Assessment of pre- and post-deployment costs",
|
| 53 |
+
"Consideration of human labor and hidden costs",
|
| 54 |
+
"Tracking of changes in costs and economy of components over time",
|
| 55 |
+
"Evaluation of costs not directly tied to the system alone",
|
| 56 |
+
"Analysis of potential future cost fluctuations"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"question": "API Cost Evaluation",
|
| 61 |
+
"explainer": "For API-accessible models, has the cost structure been evaluated?",
|
| 62 |
+
"details": [
|
| 63 |
+
"Assessment of token-usage-based pricing",
|
| 64 |
+
"Evaluation of cost variations based on initial prompt length and requested token response length",
|
| 65 |
+
"Analysis of cost differences across model versions",
|
| 66 |
+
"Examination of pricing structures for different types of requests",
|
| 67 |
+
"Consideration of volume discounts or tiered pricing models"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"question": "Comprehensive Cost Tracking",
|
| 72 |
+
"explainer": "Does the evaluation attempt to track and account for both direct and indirect costs, including those not immediately tied to the system?",
|
| 73 |
+
"details": [
|
| 74 |
+
"Assessment of costs related to broader infrastructure or organizational changes",
|
| 75 |
+
"Evaluation of long-term maintenance and update costs",
|
| 76 |
+
"Analysis of costs associated with complementary technologies or processes",
|
| 77 |
+
"Consideration of costs related to regulatory compliance or legal considerations"
|
| 78 |
+
]
|
| 79 |
+
}
|
| 80 |
+
]
|
| 81 |
+
}
|
scorecard_templates/privacy_data_protection.json
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Privacy and Data Protection Evaluation",
|
| 3 |
+
"questions": [
|
| 4 |
+
{
|
| 5 |
+
"question": "Data Minimization and Consent Practices",
|
| 6 |
+
"explainer": "Has the system been evaluated for its adherence to data minimization and consent practices?",
|
| 7 |
+
"details": [
|
| 8 |
+
"Implementation of data minimization practices",
|
| 9 |
+
"Use of opt-in data collection methods",
|
| 10 |
+
"Assessment of active consent for collecting, processing, and sharing data",
|
| 11 |
+
"Evaluation of compliance with privacy regulations (e.g., CCPA)",
|
| 12 |
+
"Measures for dataset transparency and accountability"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"question": "Memorization and Data Leakage Evaluation",
|
| 17 |
+
"explainer": "Has the system been assessed for unintended memorization and data leakage?",
|
| 18 |
+
"details": [
|
| 19 |
+
"Examination of the maximum amount of discoverable information given training data",
|
| 20 |
+
"Evaluation of extractable information without training data access",
|
| 21 |
+
"Analysis of out-of-distribution data revelation",
|
| 22 |
+
"Assessment of factors increasing likelihood of memorization (e.g., parameter count, sample repetitions)",
|
| 23 |
+
"Use of Membership Inference Attacks (MIA) or similar techniques"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"question": "Personal Information Revelation Assessment",
|
| 28 |
+
"explainer": "Has the system been evaluated for its potential to reveal personal or sensitive information?",
|
| 29 |
+
"details": [
|
| 30 |
+
"Direct prompting tests to reveal Personally Identifiable Information (PII)",
|
| 31 |
+
"Use of tools like ProPILE to audit PII revelation likelihood",
|
| 32 |
+
"Evaluation of the system's ability to infer personal attributes",
|
| 33 |
+
"Assessment of privacy violations based on Contextual Integrity and Theory of Mind",
|
| 34 |
+
"Analysis of the system's understanding of privacy context and purpose"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"question": "Image and Audio Privacy Evaluation",
|
| 39 |
+
"explainer": "For image and audio generation systems, has privacy been evaluated?",
|
| 40 |
+
"details": [
|
| 41 |
+
"Assessment of training data memorization in image generation",
|
| 42 |
+
"Use of adversarial Membership Inference Attacks for images",
|
| 43 |
+
"Evaluation of the proportion of generated images with high similarity to training data",
|
| 44 |
+
"Detection of memorized prompts in image generation",
|
| 45 |
+
"Scrutiny of audio generation models' ability to synthesize particular individuals' audio"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"question": "Intellectual Property and Copyright Evaluation",
|
| 50 |
+
"explainer": "Has the system been evaluated for its handling of intellectual property and copyrighted content?",
|
| 51 |
+
"details": [
|
| 52 |
+
"Assessment of the system's ability to generate copyrighted content",
|
| 53 |
+
"Evaluation of intellectual property concerns in generated content",
|
| 54 |
+
"Analysis of the system's handling of highly sensitive documents",
|
| 55 |
+
"Measures to prevent unauthorized use or reproduction of copyrighted material"
|
| 56 |
+
]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"question": "Retroactive Privacy Protection",
|
| 60 |
+
"explainer": "Has the system been evaluated for its ability to implement retroactive privacy protections?",
|
| 61 |
+
"details": [
|
| 62 |
+
"Assessment of the system's capability to retroactively retrain in accordance with privacy policies",
|
| 63 |
+
"Evaluation of processes for removing specific data points upon request",
|
| 64 |
+
"Analysis of the system's adaptability to changing privacy regulations",
|
| 65 |
+
"Examination of the impact of data removal on model performance",
|
| 66 |
+
"Assessment of the timeframe and effectiveness of retroactive privacy measures"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"question": "Third-party Hosting Privacy Evaluation",
|
| 71 |
+
"explainer": "For third-party hosted systems, has privacy been evaluated in the context of system prompts and hidden inputs?",
|
| 72 |
+
"details": [
|
| 73 |
+
"Assessment of potential leakage of private input data in generations",
|
| 74 |
+
"Evaluation of system prompt privacy, especially for prompts containing proprietary information",
|
| 75 |
+
"Analysis of the system's handling of sensitive database records during in-context learning",
|
| 76 |
+
"Examination of privacy measures for prepended system prompts",
|
| 77 |
+
"Assessment of the system's ability to maintain confidentiality of hidden inputs"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"question": "Generative AI-Specific Privacy Measures",
|
| 82 |
+
"explainer": "Has the evaluation considered the challenges of applying traditional privacy protection methods to generative AI?",
|
| 83 |
+
"details": [
|
| 84 |
+
"Assessment of the applicability of data sanitization techniques to generative models",
|
| 85 |
+
"Evaluation of differential privacy approaches in the context of generative AI",
|
| 86 |
+
"Analysis of novel privacy protection methods designed specifically for generative models",
|
| 87 |
+
"Examination of the trade-offs between privacy protection and model performance in generative AI"
|
| 88 |
+
]
|
| 89 |
+
}
|
| 90 |
+
]
|
| 91 |
+
}
|