Example of Labeling Functions
This is our collection of labeling functions for both row labeling and span labeling.
Labeling functions are designed to apply weak heuristics and rules for predicting labels on unlabeled data. These can be based on expert knowledge or other labeling models. The labeling functions should be coded in Python and can include:
Keyword searches with regular expressions. For instance, detecting severity scores in both numeric and Roman numeral formats.
Advanced preprocessing models using libraries like NLTK, Spacy, or TextBlob, which include POS tagging, sentiment analysis, NER, dependency parsing, syntax tree creation, stop words list, similarity measures, etc.
For Row Labeling
OpenAI ChatGPT sports
import re
from stegosaurus.annotator import target_label
# This is your constant labels dictionary, SHOULD NOT BE EDITED
# Maps each class name to its integer label id. The labeling function below
# selects its target id by name, e.g. LABELS['sports'], when it is decorated
# with @target_label.
LABELS = {
'business' : 0,
'science' : 1,
'sports' : 2,
'world' : 3
}
# Labeling function definition (Start editing here!)
import openai
# Placeholder value: substitute your own OpenAI API key before running.
# NOTE(review): avoid committing a real key to shared code; consider loading
# it from your deployment's secret/configuration store instead.
openai.api_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['sports'])
def label_function(sample):
    """Return True when ChatGPT finds a match score and calls the text sports.

    The text to label is the first value of ``sample``. It is sent to
    ``gpt-3.5-turbo`` after two worked examples (a sports sentence with a
    score, and a non-sports sentence without one) that show the model the
    expected answer format ``<score list>;<yes|no>``.

    Args:
        sample: Mapping whose first value is the text to label.

    Returns:
        bool: True only if the reply carries a non-empty score list and a
        "yes" verdict. False otherwise, including when the reply does not
        follow the ``score;verdict`` format.
    """
    text = list(sample.values())[0]

    question = (
        "What is the match score from sentence above? "
        "Is the text above classified as sports?"
    )

    def ask(sentence):
        # Every user turn has the same shape: the sentence, then the question.
        return "Sentence: {}\n{}".format(sentence, question)

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are an Expert Data Labeler in classifying text as sports by detecting match score",
            },
            # Few-shot example 1: sports sentence containing a score.
            {
                "role": "user",
                "content": ask(
                    "Treat Huey, the two-time All-Met Player of the year, suffered a 6-2, 6-0 loss to 29-year-old journeyman Mashiska Washington, the younger brother of former pro and 1996 Wimbledon finalist Malivai."
                ),
            },
            {"role": "assistant", "content": "[6-2,6-0];yes"},
            # Few-shot example 2: non-sports sentence, no score.
            {
                "role": "user",
                "content": ask(
                    "Pay for the Washington area's top executives rose significantly last year, reversing the downward trend that set in with the recession in 2001."
                ),
            },
            {"role": "assistant", "content": '[];no'},
            # The sample actually being labeled.
            {"role": "user", "content": ask(text)},
        ],
    )
    reply = completion['choices'][0]['message']['content']

    # Expected reply shape is "<score list>;<yes|no>". partition() never
    # raises, so a reply without ';' yields an empty verdict and the function
    # returns False instead of crashing with ValueError.
    detected_score, _, is_sport = reply.partition(';')

    # Tolerate harmless formatting drift from the model (spaces, case,
    # trailing period). An empty score field counts as "no score".
    has_score = detected_score.strip() not in ('', '[]')
    said_yes = is_sport.strip().rstrip('.').lower() == 'yes'
    return has_score and said_yes


# Keyword Search world
import re
from stegosaurus.annotator import target_label
# This is your constant labels dictionary, SHOULD NOT BE EDITED
# Maps each class name to its integer label id. The labeling function below
# selects its target id by name, e.g. LABELS['world'], when it is decorated
# with @target_label.
LABELS = {
'business' : 0,
'science' : 1,
'sports' : 2,
'world' : 3
}
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
# Keywords whose presence is taken as evidence for the 'world' class.
# 'harassed' is the correct spelling; the original misspelling 'harrassed' is
# kept as well so text containing the same typo still matches.
TARGET_KEYWORDS = (
    'confrontation', 'violent', 'harassed', 'harrassed', 'fight', 'vehicle',
    'government', 'employment', 'military', 'war',
)

# One case-insensitive alternation, compiled once. re.escape keeps every
# keyword a literal even if someone later adds one with regex metacharacters.
# NOTE(review): matching is by substring, so 'war' also hits "award" and
# "software"; add word boundaries if that proves too noisy.
_KEYWORD_RE = re.compile(
    '|'.join(re.escape(keyword) for keyword in TARGET_KEYWORDS),
    re.IGNORECASE,
)


@target_label(label=LABELS['world'])
def label_function(sample):
    """Return True when the text contains any of TARGET_KEYWORDS.

    Args:
        sample: Mapping whose first value is the text to label.

    Returns:
        bool: True if at least one keyword occurs in the text
        (case-insensitive substring match), otherwise False.
    """
    text = list(sample.values())[0]
    return _KEYWORD_RE.search(text) is not None


# Regex rule sports
import re
from stegosaurus.annotator import target_label
# This is your constant labels dictionary, SHOULD NOT BE EDITED
# Maps each class name to its integer label id. The labeling function below
# selects its target id by name, e.g. LABELS['sports'], when it is decorated
# with @target_label.
LABELS = {
'business' : 0,
'science' : 1,
'sports' : 2,
'world' : 3
}
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
# A score such as "6-2" or "21-19": two non-negative integers without leading
# zeros, joined by a hyphen and delimited by word boundaries.
# In a raw string a single backslash is required: r"\\b" would match a literal
# backslash followed by "b", so the pattern could never match a real score.
SCORE_PATTERN = re.compile(r"\b(0|[1-9]\d*)-(0|[1-9]\d*)\b")

# All patterns that count as evidence for the 'sports' class.
PATTERNS = (SCORE_PATTERN,)


@target_label(label=LABELS['sports'])
def label_function(sample):
    """Return True when the text contains something shaped like a match score.

    Args:
        sample: Mapping whose first value is the text to label.

    Returns:
        bool: True if any pattern in PATTERNS is found in the text,
        otherwise False.
    """
    text = list(sample.values())[0]
    return any(pattern.search(text) for pattern in PATTERNS)


# Spacy world
import re
from stegosaurus.annotator import target_label
# This is your constant labels dictionary, SHOULD NOT BE EDITED
# Maps each class name to its integer label id. The labeling function below
# selects its target id by name, e.g. LABELS['world'], when it is decorated
# with @target_label.
LABELS = {
'business' : 0,
'science' : 1,
'sports' : 2,
'world' : 3
}
# Labeling function definition (Start editing here!)
import spacy
# Load the spaCy pipeline once at module import so every call to the labeling
# function reuses the same model instead of reloading it.
nlp = spacy.load("en_core_web_sm")
# Entity labels treated as evidence for the 'world' class.
# NOTE(review): NORP is, per spaCy's label scheme, "nationalities or religious
# or political groups"; confirm against the model's documentation.
NER_LABELS = ["NORP"]
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
    """Return True when spaCy finds an entity whose label is in NER_LABELS.

    The original version collected the matching entity strings and then
    searched for each one, as an unescaped regex, in the very text it was
    extracted from. That search is redundant, and it raised ``re.error`` or
    missed the match when the entity text contained regex metacharacters.
    Checking the entity label directly gives the intended result.

    Args:
        sample: Mapping whose first value is the text to label.

    Returns:
        bool: True if at least one recognized entity carries a label listed
        in NER_LABELS, otherwise False.
    """
    text = list(sample.values())[0]
    doc = nlp(text)
    return any(entity.label_ in NER_LABELS for entity in doc.ents)


# For Span Labeling
Last updated