Skip to content

Commit

Permalink
feat: add printable text
Browse files Browse the repository at this point in the history
  • Loading branch information
CFisicaro committed Mar 4, 2022
1 parent 04270b5 commit 2f2379c
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
2 changes: 0 additions & 2 deletions preprocessing/bertPrep.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
import ProteinSharding

import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess
Expand Down
22 changes: 22 additions & 0 deletions preprocessing/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,28 @@ def load_vocab(vocab_file):
return vocab


def printable_text(text):
    """Converts `text` to the interpreter's native `str` for print/logging.

    The native `str` is a Unicode string on Python 3 and a byte string on
    Python 2, so the conversion applied depends on the running interpreter:

    * Python 3: `str` is returned unchanged; `bytes` is decoded as UTF-8,
      with undecodable bytes dropped.
    * Python 2: `str` is returned unchanged; `unicode` is encoded as UTF-8.

    Raises:
        ValueError: if `text` is not one of the string types handled for the
            running interpreter, or if `six` reports neither Python 2 nor
            Python 3.
    """
    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            # "ignore" drops invalid byte sequences instead of raising.
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        if isinstance(text, str):
            return text
        # `unicode` only exists as a builtin on Python 2; this branch is
        # never evaluated on Python 3.
        if isinstance(text, unicode):
            return text.encode("utf-8")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")

class ProteoNeMoTokenizer(object):
"""Runs end-to-end tokenization:"""

Expand Down

0 comments on commit 2f2379c

Please sign in to comment.