-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_labelled.py
More file actions
79 lines (65 loc) · 2.95 KB
/
extract_labelled.py
File metadata and controls
79 lines (65 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
""" Extract and count unique labelled text spans from a label studio exported json file. """
__author__ = "Ray Stefancsik"
__version__ = "0.1"
######################################################################
# Import modules
######################################################################
import argparse # command-line parsing module from the Python standard library
import json # serialize, de-serialize, etc., JSON
from collections import Counter # dict subclass that tallies hashable items
######################################################################
# Constants
######################################################################
# Sentinel value of --pmcid meaning "use the input file name as the identifier".
_FILENAME_AS_ID = "use_filename_as_id"

# Column headers; the order matches the rows built in format_table().
HEADER = 'PUB_ID\tLABEL\tTEXT\tCOUNT'


######################################################################
# Obtain user input
######################################################################
def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``input_file`` and ``pmcid``.
    """
    parser = argparse.ArgumentParser(
        description='Extract and count unique labelled text spans from a label studio exported json file.',
        formatter_class=argparse.RawTextHelpFormatter,
    )
    ### Mandatory positional arguments
    parser.add_argument('input_file', help='path to your input file')
    ### Optional arguments:
    # Both default and const are needed to cover three scenarios:
    # 1. no flag at all, 2. flag given without a value, 3. flag plus a value.
    # See https://docs.python.org/3/library/argparse.html
    parser.add_argument(
        '-p', '--pmcid',
        help='provide an identifier for your data file',
        nargs='?',
        default=_FILENAME_AS_ID,
        const=_FILENAME_AS_ID,
    )
    return parser.parse_args(argv)


######################################################################
# parse json data
######################################################################
def extract_labelled_spans(tasks):
    """Collect one (label, text) pair per label of every labelled text span.

    Walks ``task["annotations"][*]["result"][*]["value"]`` and reads its
    ``"labels"`` and ``"text"`` entries. Leading and trailing whitespace is
    stripped from the text.

    Result entries that have no ``"value"`` mapping, no string ``"text"`` or
    no ``"labels"`` are skipped instead of raising ``KeyError``; presumably
    these are relations or other non-span results (confirm against your
    labelling configuration).

    Args:
        tasks: the de-serialized export, an iterable of task dicts.

    Returns:
        A list of ``(label, text)`` tuples, in document order.

    Raises:
        KeyError: if a task lacks ``"annotations"`` or an annotation lacks
            ``"result"``.
    """
    pairs = []
    for task in tasks:
        for annotation in task["annotations"]:
            for region in annotation["result"]:
                value = region.get("value") or {}
                text = value.get("text")
                if not isinstance(text, str):
                    continue  # nothing to report without a text span
                span = text.strip()
                pairs.extend((label, span) for label in value.get("labels") or ())
    return pairs


def count_spans(pairs):
    """Count identical (label, text) pairs.

    Args:
        pairs: iterable of hashable ``(label, text)`` tuples.

    Returns:
        A list of ``((label, text), count)`` tuples sorted by label, then text.
    """
    return sorted(Counter(pairs).items())


######################################################################
# format results
######################################################################
def format_table(pub_id, sorted_counts):
    """Build the tab-separated output lines, header first.

    Args:
        pub_id: identifier written in the first column of every row.
        sorted_counts: ``((label, text), count)`` tuples as returned by
            ``count_spans``.

    Returns:
        A list of strings without trailing newlines.
    """
    lines = [HEADER]
    lines.extend(
        f'{pub_id}\t{label}\t{text}\t{count}'
        for (label, text), count in sorted_counts
    )
    return lines


######################################################################
# print sorted results
######################################################################
def main(argv=None):
    """Read the export named on the command line and print the counts table.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    # input filename
    fname = args.input_file
    # fall back to the file name when no identifier was supplied
    pmcid = fname if args.pmcid == _FILENAME_AS_ID else args.pmcid

    # Read in json data
    with open(fname, mode="r", encoding="utf-8") as input_file:
        json_data = json.load(input_file)

    sorted_counts = count_spans(extract_labelled_spans(json_data))

    # print headers and sorted results as tab separated fields
    for line in format_table(pmcid, sorted_counts):
        print(line)


if __name__ == "__main__":
    main()