Example of Labeling Functions

This is our collection of example labeling functions for both row labeling and span labeling.

Labeling functions are designed to apply weak heuristics and rules for predicting labels on unlabeled data. These can be based on expert knowledge or other labeling models. The labeling functions should be coded in Python and can include:

  • Keyword searches with regular expressions. For instance, detecting severity scores in both numeric and Roman numeral formats.

  • Advanced preprocessing with NLP libraries such as NLTK, spaCy, or TextBlob, which provide POS tagging, sentiment analysis, NER, dependency parsing, syntax-tree construction, stop-word lists, similarity measures, etc.

For Row Labeling

OpenAI ChatGPT sports
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'business' : 0,
  'science' : 1,
  'sports' : 2,
  'world' : 3
}

# Labeling function definition (Start editing here!)
import openai
# Placeholder credential: replace it with your own OpenAI API key before running.
# NOTE(review): avoid committing a real key inside a labeling function.
openai.api_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['sports'])
def label_function(sample):
  """Label a row as sports when ChatGPT finds a match score and confirms sports.

  The text to classify is the first value of ``sample``. It is sent to the
  chat model after two worked examples that demonstrate the expected reply
  format ``<score list>;<yes|no>`` (for instance ``[6-2,6-0];yes``).

  Args:
    sample: Mapping whose first value is the text to classify.

  Returns:
    True only when the reply contains a non-empty score list and a "yes"
    verdict. False otherwise, including when the reply does not follow the
    expected ``<score list>;<yes|no>`` format.
  """
  text = list(sample.values())[0]

  # NOTE(review): ChatCompletion is the pre-1.0 openai SDK interface; confirm
  # it matches the installed openai version.
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": "You are an Expert Data Labeler in classifying text as sports by detecting match score"},
      {
        "role": "user",
        "content": """Sentence: Treat Huey, the two-time All-Met Player of the year, suffered a 6-2, 6-0 loss to 29-year-old journeyman Mashiska Washington, the younger brother of former pro and 1996 Wimbledon finalist Malivai.

          What is the match score from sentence above? Is the text above classified as sports?"""},
      {"role": "assistant", "content": "[6-2,6-0];yes"},
      {
        "role": "user",
        "content": """Sentence: Pay for the Washington area's top executives rose significantly last year, reversing the downward trend that set in with the recession in 2001.

          What is the match score from sentence above? Is the text above classified as sports?"""},
      {"role": "assistant", "content": '[];no'},
      {
        "role": "user",
        "content": """Sentence: {}
          
          What is the match score from sentence above? Is the text above classified as sports?""".format(text)}
    ]
  )
  match_text = completion['choices'][0]['message']['content']

  # The reply is free text produced by the model. Split on the LAST ';' so a
  # score list that itself contains ';' still parses, and treat a reply
  # without any ';' as "no label" instead of raising ValueError on unpacking.
  detected_score, separator, is_sport = match_text.rpartition(';')
  if not separator:
    return False

  # Tolerate harmless formatting differences such as "[ ]", " Yes" or "yes.".
  has_score = detected_score.replace(' ', '') not in ('', '[]')
  said_yes = is_sport.strip().rstrip('.').lower() == 'yes'
  return has_score and said_yes
Keyword Search world
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'business' : 0, 
  'science' : 1, 
  'sports' : 2, 
  'world' : 3
}

# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
  """Label a row as world news when it contains any of the target keywords.

  Matching is case-insensitive and substring-based, so a keyword also matches
  inside longer words (for example 'fight' matches 'fighting', and 'war'
  matches 'award').

  Args:
    sample: Mapping whose first value is the text to classify.

  Returns:
    True if at least one keyword occurs in the text, otherwise False.
  """
  text = list(sample.values())[0]

  # 'harassed' is the correct spelling. The misspelled 'harrassed' is kept so
  # text containing the typo still matches as it did before.
  TARGET_KEYWORDS = ['confrontation', 'violent', 'harassed', 'harrassed', 'fight', 'vehicle', 'government', 'employment', 'military', 'war']

  # Escape every keyword so it is matched literally even if a keyword with
  # regex metacharacters is added later, then test them all in a single pass.
  pattern = '|'.join(re.escape(keyword) for keyword in TARGET_KEYWORDS)
  return re.search(pattern, text, re.IGNORECASE) is not None
Regex rule sports
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'business' : 0, 
  'science' : 1, 
  'sports' : 2, 
  'world' : 3
}

# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['sports'])
def label_function(sample):
  """Label a row as sports when it contains a score such as "6-2" or "21-0".

  A score is two non-negative integers without leading zeros, joined by a
  hyphen and delimited by word boundaries.

  Args:
    sample: Mapping whose first value is the text to classify.

  Returns:
    True if any pattern in PATTERNS is found in the text, otherwise False.
  """
  text = list(sample.values())[0]

  # In a raw string the escapes must be single backslashes: r"\\b" would match
  # a literal backslash followed by "b" and therefore never match a score.
  score = re.compile(r"\b(0|[1-9]\d*)-(0|[1-9]\d*)\b")
  PATTERNS = [score]
  return any(pattern.search(text) for pattern in PATTERNS)
Spacy world
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'business' : 0, 
  'science' : 1, 
  'sports' : 2, 
  'world' : 3
}

# Labeling function definition (Start editing here!)
import spacy
# Load the spaCy pipeline once at module level; label_function reuses it on every call.
nlp = spacy.load("en_core_web_sm")
# Entity labels whose presence marks a row as 'world'
# (NORP: nationalities or religious or political groups in spaCy's label scheme).
NER_LABELS = ["NORP"]

# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
  """Label a row as world news when spaCy finds an entity listed in NER_LABELS.

  Args:
    sample: Mapping whose first value is the text to classify.

  Returns:
    True if the spaCy pipeline reports at least one entity whose label is in
    NER_LABELS, otherwise False.
  """
  text = list(sample.values())[0]

  # Every entity is a span taken from `text`, so its label alone decides the
  # outcome. Searching the entity text back in `text` as an unescaped regex
  # was redundant and could raise re.error or mis-match on entities that
  # contain regex metacharacters such as "." or "(".
  return any(entity.label_ in NER_LABELS for entity in nlp(text).ents)
Textblob (for sentiment analysis) positive
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'positive' : 0, 
  'negative' : 1, 
}

# Labeling function definition (Start editing here!)
from textblob import TextBlob
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['positive'])
def label_function(sample):
  """Label a row as positive when TextBlob reports a polarity above zero.

  Args:
    sample: Mapping whose first value is the text to classify.

  Returns:
    True if the sentiment polarity of the text is strictly greater than 0,
    otherwise False (neutral and negative texts are not labeled).
  """
  text = list(sample.values())[0]

  sentiment = TextBlob(text).sentiment
  return sentiment.polarity > 0

For Span Labeling

OpenAI ChatGPT event
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'art' : 0, 
  'event' : 1, 
  'geo' : 2, 
  'gpe' : 3, 
  'nat' : 4, 
  'org' : 5, 
  'per' : 6, 
  'tim' : 7
}

# Labeling function definition (Start editing here!)
import openai
# Placeholder credential: replace it with your own OpenAI API key before running.
# NOTE(review): avoid committing a real key inside a labeling function.
openai.api_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

# Assign target label based on LABELS dictionary
@target_label(label=LABELS['event'])
def label_function(sample):
  """Find spans of sports events that ChatGPT extracts from the text.

  The text to label is the first value of ``sample``. It is sent to the chat
  model after two worked examples that demonstrate the expected reply: a
  Python-style list of event names, e.g. ``['Asian Cup']`` or ``[]``.

  Args:
    sample: Mapping whose first value is the text to label.

  Returns:
    A list with one ``re.finditer`` iterator per extracted event name, each
    yielding the case-insensitive occurrences of that name in the text. The
    list is empty when the reply cannot be parsed or names no event.
  """
  import ast  # converts the model's string reply into a Python list

  text = list(sample.values())[0]

  # NOTE(review): ChatCompletion is the pre-1.0 openai SDK interface; confirm
  # it matches the installed openai version.
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": "You are an Expert Data Labeler in classifying sports event"},
      {
        "role": "user",
        "content": """Sentence: The former Soviet republic was playing in an Asian Cup finals tie for the first time .

          What is the sports event from the sentence above?"""},
      {"role": "assistant", "content": str(["Asian Cup"])},
      {
        "role": "user",
        "content": """Sentence: 68 , Trevor Dodds ( Namibia ) 72 69 142 Don Robertson ( U.S. ) 73 69 , Dion Fourie 69 73 ,

          What is the sports event from the sentence above?"""},
      {"role": "assistant", "content": '[]'},
      {
        "role": "user",
        "content": """Sentence: {}
          
          What is the sports event from the sentence above?""".format(text)}
    ]
  )

  match_text = completion['choices'][0]['message']['content']

  # The reply is free text from the model, so it may not be a valid literal.
  # Treat an unparsable reply as "no spans" rather than crashing the run.
  try:
    TARGET_KEYWORDS = ast.literal_eval(match_text.strip())
  except (ValueError, SyntaxError, TypeError):
    return []

  # A bare string reply means a single event; iterating it directly would
  # search for every individual character.
  if isinstance(TARGET_KEYWORDS, str):
    TARGET_KEYWORDS = [TARGET_KEYWORDS]
  elif not isinstance(TARGET_KEYWORDS, (list, tuple, set)):
    return []

  # Skip non-string and empty entries: an empty pattern matches at every
  # position of the text.
  match_list = [
    re.finditer(re.escape(target), text, re.IGNORECASE)
    for target in TARGET_KEYWORDS
    if isinstance(target, str) and target
  ]

  return match_list
Regex rule police name
import re
from stegosaurus.annotator import target_label
from typing import List

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'address' : 0,
  'crime/situation' : 1,
  'police officer name' : 2
}

# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['police officer name'])
def label_function(sample) -> List:
  """Find capitalized names that directly follow a police rank title.

  For each title ('Officer', 'Lieutenant', 'Sergeant') a pattern is built
  that looks behind for the title plus one whitespace character and then
  matches a run of whitespace-separated words, each an uppercase letter
  followed by zero or more lowercase letters.

  Args:
    sample: Mapping whose first value is the text to label.

  Returns:
    A list with one match iterator per title, in the order listed above.
  """
  text = list(sample.values())[0]

  titles = ('Officer', 'Lieutenant', 'Sergeant')
  name_patterns = [
    re.compile('(?<={}\\s)[A-Z][a-z]*(?:\\s+[A-Z][a-z]*)*'.format(title))
    for title in titles
  ]

  return [name_pattern.finditer(text) for name_pattern in name_patterns]
Keyword Prefix suspect name
import re
from stegosaurus.annotator import target_label
from typing import List

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'address' : 0,
  'crime/situation' : 1,
  'suspect name' : 2,
  'witness' : 3
}

# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['suspect name'])
def label_function(sample) -> List:
  """Label everything after a leading 'Suspect:' prefix as the suspect name.

  Args:
    sample: Mapping whose first value is the text to label.

  Returns:
    A list holding one match iterator for the suspect name, or an empty list
    when the text does not start with 'Suspect:' or nothing follows it.
  """
  text = list(sample.values())[0]

  keyword_prefix = 'Suspect:'
  if not text.startswith(keyword_prefix):
    return []

  # Slice the prefix off instead of str.replace(), which would also remove
  # later occurrences of 'Suspect:', and strip the surrounding whitespace so
  # the labeled span covers only the name.
  suspect_name = text[len(keyword_prefix):].strip()
  if not suspect_name:
    # An empty pattern would match at every position of the text.
    return []

  # Escape the name so characters such as '.' or '(' are matched literally,
  # and start searching after the prefix so a short name (e.g. "S") cannot
  # match inside 'Suspect:' itself. Match offsets still refer to `text`.
  name_pattern = re.compile(re.escape(suspect_name))
  return [name_pattern.finditer(text, len(keyword_prefix))]
Keyword Match slang label

import re
from stegosaurus.annotator import target_label
from typing import List

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'slang label' : 0
}

# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['slang label'])
def label_function(sample) -> List:
  """Find slang words, abbreviations and "<number>ish" expressions in the text.

  Tokens containing 'http' or 'www' are masked with '=' characters of the same
  length before matching, so URL fragments are not labeled and match offsets
  still line up with the original text.

  Args:
    sample: Mapping whose first value is the text to label.

  Returns:
    A list of match iterators: one per slang keyword (case-insensitive,
    whole-token matches), followed by one for the "<digits>ish" pattern.
  """
  text = list(sample.values())[0]

  ## define all slang collections
  slang_vocab = ["kiddo", "cuz", "havin", "wanna", 'lit', 'slay', 'sus', 'buck', 'whip', 'nuts', 'flaky','chill', 'lemme', 'gimme']
  slang_abrev = ['4ao', 'a3', 'aamof', 'abt', 'acct', 'adih', 'adn', 'afaic', 'afaict', 'afaik', 'afair', 'afk', 'asap', 'asl', 'atk', 'ave.', 'aymm', 'ayor', 'b&b', 'b+b', 'b2b', 'b2c', 'b4', 'b4n', 'b@u', 'bae', 'bak', 'bbbg', 'bbias', 'bbl', 'bbs', 'bc', 'be4', 'bfn', 'blvd', 'bout', 'brb', 'bros', 'bro', 'brt', 'bsaaw', 'bt', 'btw', 'bwl', 'c/o', 'cet', 'cf', 'csl', 'cu', 'cul8r', 'cwot', 'cya', 'cyt', 'dae', 'dbmib', 'diy', 'dm', 'dwh', 'e123', 'eet', 'eg', 'embm', 'encl', 'encl.', 'etc', 'faq', 'fawc', 'fc', 'fig', 'fimh', 'ft', 'ftl', 'ftw', 'fwiw', 'fyi', 'g9', 'gahoy', 'gal', 'gcse', 'gf', 'gfn', 'gg', 'gl', 'glhf', 'gmt', 'gmta', 'gn', 'g.o.a.t', 'goat', 'goi', 'gr8', 'gratz', 'gyal', 'h&c', 'hp', 'hr', 'hrh', 'ht', 'ibrb', 'ic', 'icq', 'icymi', 'idc', 'idgadf', 'idgaf', 'idk', 'ie', 'i.e', 'ifyp', 'IG', 'iirc', 'ik', 'ilu', 'ily', 'ima', 'imho', 'imo', 'imu', 'iow', 'irl', 'j4f', 'jic', 'jk', 'jp', 'js', 'jsyk', 'kms', 'l8r', 'lb', 'lbs', 'ldr', 'lmao', 'lmfao', 'lol', 'ltd', 'ltns', 'm8', 'mf', 'mfs', 'mfw', 'mofo', 'mph', 'mrw', 'mte', 'nagi', 'nbc', 'nbd', 'nfs', 'ngl', 'nhs', 'nit', 'npr', 'nrn', 'nsfl', 'nsfw', 'nth', 'nvr', 'nyc', 'oc', 'og', 'ohp', 'oic', 'omdb', 'omg', 'omw', 'pfft', 'poc', 'pov', 'pp', 'ppl', 'prw', 'ps', 'pt', 'ptb', 'pto', 'qpsa', 'ratchet', 'rbtl', 'rlrt', 'rn', 'rofl', 'roflol', 'rotflmao', 'rt', 'ruok', 'sfw', 'sk8', 'smh', 'sq', 'srsly', 'ssdd', 'tbh', 'tbs', 'tbsp', 'tf', 'tfw', 'thks', 'tho', 'thx', 'tia', 'til', 'tl;dr', 'tlg', 'tldr', 'tmb', 'tntl', 'tt', 'ttyl', 'u', 'ur', 'u2', 'u4e', 'utc', 'w/', 'w/o', 'w8', 'wassup', 'wb', 'wk', 'wrd', 'wtf', 'wtg', 'wtpa', 'wtv', 'wuf', 'wuzup', 'wywh', 'y', 'yd', 'ygtr', 'ynk', 'zzz', '&#39']

  # include suitable abbreviation/slang that meet the requirements as our target keywords
  TARGET_KEYWORDS = slang_vocab + slang_abrev

  # Mask URL-like tokens with '=' of equal length: they would otherwise be
  # picked up as slang/abbreviations, and equal length keeps offsets intact.
  text_list = text.split(' ')
  prep_text_list = [token if 'http' not in token and 'www' not in token else '='*len(token) for token in text_list]
  prep_text = ' '.join(prep_text_list)

  # Collect the defined keywords in the text as 'Slang label'.
  # - re.escape: keywords such as 'b+b', 'ave.' or 'w/' contain regex
  #   metacharacters and must be matched literally.
  # - (?<!\w) / (?!\w) instead of \b: they act as word boundaries but also
  #   work for keywords that begin or end with punctuation ('w/', '&#39').
  # - single backslashes: in a raw string r'\\b' would match a literal
  #   backslash followed by 'b' and never match anything useful.
  match_list_1 = [
    re.finditer(r'(?<!\w){}(?!\w)'.format(re.escape(target)), prep_text, re.IGNORECASE)
    for target in TARGET_KEYWORDS
  ]

  # add another pattern (ex: 20ish, 30ish, etc) as 'Slang label'
  match_list_2 = [re.finditer(r'\d+ish', prep_text, re.IGNORECASE)]

  # collect all matched keywords and patterns that are considered 'Slang label'
  match_list = match_list_1 + match_list_2
  return match_list
Spacy time
import re
from stegosaurus.annotator import target_label

# This is your constant labels dictionary, SHOULD NOT BE EDITED
LABELS = {
  'art' : 0, 
  'eve' : 1, 
  'geo' : 2, 
  'gpe' : 3, 
  'nat' : 4, 
  'org' : 5, 
  'per' : 6, 
  'time' : 7
}

# Labeling function definition (Start editing here!)
import spacy
# Load the spaCy pipeline once at module level; label_function reuses it on every call.
nlp = spacy.load("en_core_web_sm")
# Entity labels whose spans are labeled as 'time'.
NER_LABELS = ['TIME']

# Assign target label based on LABELS dictionary
@target_label(label=LABELS['time'])
def label_function(sample):
  """Find spans of the text that spaCy tags with a label in NER_LABELS.

  Args:
    sample: Mapping whose first value is the text to label.

  Returns:
    A list with one match iterator per qualifying entity, in the order spaCy
    reports them. Each iterator yields every literal occurrence of that
    entity's text in the input.
  """
  text = list(sample.values())[0]

  # Escape the entity text so it is searched for literally.
  time_patterns = [
    re.escape(str(entity))
    for entity in nlp(text).ents
    if entity.label_ in NER_LABELS
  ]

  return [re.finditer(time_pattern, text) for time_pattern in time_patterns]

Last updated