# Source code for vimtk._dirty
"""
This is a module for quick and dirty ports from other codebases that I'd like
to include, but need to be cleaned up and integrated into vimtk properly.
This is a staging ground where I can use them, but also keep them separate.
Port Logic
----------
# Liberator makes porting dirty code pretty easy
import liberator
import utool as ut
lib = liberator.Liberator()
lib.add_dynamic(ut.format_multiple_paragraph_sentences)
lib.expand(['utool'])
source = lib.current_sourcecode()
print(source)
lib = liberator.Liberator()
lib.add_dynamic(ut.remove_doublspaces)
lib.add_dynamic(ut.flatten_textlines)
lib.add_dynamic(ut.get_minimum_indentation)
lib.add_dynamic(ut.regex_or)
lib.add_dynamic(ut.interleave)
lib.expand(['utool'])
source = lib.current_sourcecode()
print(source)
"""
import re
import ubelt as ub
[docs]
def regex_reconstruct_split(pattern, text):
separators = [match.group() for match in re.finditer(pattern, text)]
remaining = text
block_list = []
for sep in separators:
head, tail = remaining.split(sep, 1)
block_list.append(head)
remaining = tail
block_list.append(remaining)
return block_list, separators
def msgblock(key, text, side='|'):
    """
    Wrap ``text`` in a simple ascii-art block labeled with ``key``.

    Args:
        key (str): label drawn in the header and footer rules
        text (str): body text to enclose
        side (str): character drawn on the left edge of each body line

    Returns:
        str: the decorated text block (always newline terminated)
    """
    header = ' + --- ' + key + ' ---\n'
    footer = ' L ___ ' + key + ' ___\n'
    body = []
    for line in text.split('\n'):
        body.append(' ' + side + ' ' + line + '\n')
    return header + ''.join(body) + footer
def get_indentation(line_):
    """
    Return the number of leading whitespace characters in ``line_``.
    """
    stripped = line_.lstrip()
    return len(line_) - len(stripped)
def get_minimum_indentation(text):
    r"""
    Return the smallest indentation over all non-blank lines of ``text``.

    Args:
        text (str): unicode text

    Returns:
        int: indentation (0 if there are no non-blank lines)

    Example:
        >>> # ENABLE_DOCTEST
        >>> text = '    foo\n   bar'
        >>> result = get_minimum_indentation(text)
        >>> print(result)
        3
    """
    indents = [
        len(line) - len(line.lstrip())
        for line in text.split('\n')
        if line.strip()
    ]
    return min(indents) if indents else 0
def interleave(args):
    r"""
    Round-robin over the given iterables (zip followed by flatten), stopping
    as soon as any one of them is exhausted.

    Args:
        args (tuple): tuple of iterables to interleave

    Example:
        >>> args = ([1, 2, 3, 4, 5], ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
        >>> genresult = interleave(args)
        >>> result = ub.repr2(list(genresult), nl=False)
        >>> print(result)
        [1, 'A', 2, 'B', 3, 'C', 4, 'D', 5, 'E']
    """
    import itertools as it
    iterators = [iter(arg) for arg in args]
    for current in it.cycle(iterators):
        try:
            item = next(current)
        except StopIteration:
            return
        yield item
def format_single_paragraph_sentences(text, debug=False, myprefix=True,
                                      sentence_break=True, max_width=73,
                                      sepcolon=True):
    r"""
    Reflow one paragraph so each sentence starts on its own line; helps me
    separate sentences grouped in paragraphs that I have a difficult time
    reading due to dyslexia.

    Args:
        text (str): the paragraph text to rewrap
        debug (bool): if True, print diagnostics.  NOTE(review): debug mode
            calls ``colorprint``, which is not defined in this module —
            confirm it is provided elsewhere or debug=True will raise
            NameError.
        myprefix (bool): if True, indent continuation lines of a sentence
        sentence_break (bool): if True, break lines at sentence boundaries,
            otherwise wrap anywhere
        max_width (int | None): maximum line width, or None to disable
            wrapping
        sepcolon (bool): if True, treat ':' as a sentence separator

    Returns:
        str: wrapped_text

    Example:
        >>> # DISABLE_DOCTEST
        >>> text = ' lorium ipsum doloar dolar dolar dolar erata man foobar is this there yet almost man not quit ate 80 chars yet hold out almost there? dolar erat. sau.ltum. fds.fd... . . fd oob fd. list: (1) abcd, (2) foobar (4) 123456789 123456789 123456789 123456789 123 123 123 123 123456789 123 123 123 123 123456789 123456789 123456789 123456789 123456789 123 123 123 123 123 123456789 123456789 123456789 123456789 123456789 123456789 (3) spam.'
        >>> print('text = %r' % (text,))
        >>> sentence_break = not ub.argflag('--nobreak')
        >>> wrapped_text = format_single_paragraph_sentences(text, debug=True, sentence_break=sentence_break)
        >>> result = ('wrapped_text =\n%s' % (str(wrapped_text),))
        >>> print(result)
    """
    import textwrap
    import re
    min_indent = get_minimum_indentation(text)
    # Snap the indentation down to a multiple of 4 spaces
    min_indent = (min_indent // 4) * 4
    if debug:
        print(colorprint(msgblock('preflat', repr(text)), 'darkyellow'))

    def remove_doublspaces(text):
        # FIX: the pattern was ' *', which also matches the empty string and
        # therefore inserted a space between every pair of characters.
        # ' +' collapses each run of spaces to a single space as intended.
        new_text = re.sub(' +', ' ', text)
        return new_text

    def flatten_textlines(text):
        # Join all lines of the paragraph into a single line
        new_text = re.sub(' *\n *', ' ', text, flags=re.MULTILINE).strip(' ')
        return new_text

    text_ = remove_doublspaces(text)
    # TODO: more intelligent sentence parsing
    # FIX: this previously flattened the original ``text``, silently
    # discarding the result of remove_doublspaces above.
    text_ = flatten_textlines(text_)
    if debug:
        print(colorprint(msgblock('postflat', repr(text_)), 'yellow'))
    raw_sep_chars = ['.', '?', '!', ':']
    if not sepcolon:
        raw_sep_chars.remove(':')

    def split_sentences(text_):
        # Split the flattened paragraph into (sentence, separator) lists
        # using regular expressions.
        def regex_or(list_):
            return '(' + '|'.join(list_) + ')'
        # TODO: rectify with split_sentences2
        esc = re.escape
        # Define separation patterns: sentence-final punctuation plus
        # enumeration prefixes like "(1)"
        regex_sep_chars = list(map(re.escape, raw_sep_chars))
        regex_sep_prefix = [esc('(') + r'\d' + esc(')')]
        regex_sep_list = regex_sep_chars + regex_sep_prefix
        # Combine into a full regex
        sep_pattern = regex_or(regex_sep_list)
        full_pattern = '(' + sep_pattern + r'+\s)'
        full_regex = re.compile(full_pattern)
        # Make the splits; re.split keeps each capture group, so every
        # num_groups + 1 items form one (sentence, seps...) bin
        num_groups = full_regex.groups  # num groups in the regex
        split_list = re.split(full_pattern, text_)
        if len(split_list) > 0:
            num_bins = num_groups + 1
            sentence_list = split_list[0::num_bins]
            sep_list_group1 = split_list[1::num_bins]
            sep_list = sep_list_group1
        if debug:
            print('<SPLIT DBG>')
            print('num_groups = %r' % (num_groups,))
            print('len(split_list) = %r' % (len(split_list)))
            print('len(split_list) / len(sentence_list) = %r' % (
                len(split_list) / len(sentence_list)))
            print('len(sentence_list) = %r' % (len(sentence_list),))
            print('len(sep_list_group1) = %r' % (len(sep_list_group1),))
            print('full_pattern = %s' % (full_pattern,))
            print('sentence_list = %s' % (ub.repr2(sentence_list),))
            print('sep_list = %s' % ((sep_list),))
            print('</SPLIT DBG>')
        return sentence_list, sep_list

    def wrap_sentences(sentence_list, min_indent, max_width):
        # Wrap each sentence to max_width, prefixing continuation lines
        if myprefix:
            # helps me read LaTeX
            # NOTE(review): the HTML extraction this was recovered from
            # collapsed runs of spaces; the original prefix may have been
            # wider than one space — confirm against upstream utool.
            sentence_prefix = ' '
        else:
            sentence_prefix = ''
        if text_.startswith('>>>'):
            # Hack to do docstrings
            # TODO: make an actual docstring reformatter
            sentence_prefix = '... '
        if max_width is not None:
            width = max_width - min_indent - len(sentence_prefix)
            wrapkw = dict(width=width, break_on_hyphens=False,
                          break_long_words=False)
            wrapped_lines_list = []
            for line in sentence_list:
                wrapped_lines = textwrap.wrap(line, **wrapkw)
                # Only continuation lines (count > 0) get the prefix
                wrapped_lines = [line_ if count == 0 else sentence_prefix + line_
                                 for count, line_ in enumerate(wrapped_lines)]
                wrapped_lines_list.append(wrapped_lines)
            wrapped_sentences = ['\n'.join(lines) for lines in wrapped_lines_list]
        else:
            wrapped_sentences = sentence_list[:]
        return wrapped_sentences

    def rewrap_sentences2(sentence_list, sep_list):
        # FIXME: probably where nl error is
        # Put the newline before or after the sep depending on if it is
        # supposed to prefix or suffix the sentence.
        from itertools import zip_longest
        sentence_list2 = ['']
        _iter = zip_longest(sentence_list, sep_list)
        for sentence, sep in _iter:
            if sep is None:
                # Trailing sentence with no separator after it
                sentence_list2[-1] += sentence
                continue
            sepchars = sep.strip()
            if len(sepchars) > 0 and sepchars[0] in raw_sep_chars:
                # Punctuation suffixes the current sentence (e.g. '. ')
                sentence_list2[-1] += sentence + sep.strip()
                sentence_list2.append('')
            else:
                # Enumeration markers like '(1) ' prefix the next sentence
                sentence_list2[-1] += sentence
                sentence_list2.append(sep)
        sentence_list2 = [x.strip() for x in sentence_list2 if len(x.strip()) > 0]
        return sentence_list2

    if sentence_break:
        # Break at sentences
        sentence_list, sep_list = split_sentences(text_)
        # FIXME: probably where nl error is
        sentence_list2 = rewrap_sentences2(sentence_list, sep_list)
        wrapped_sentences = wrap_sentences(sentence_list2, min_indent, max_width)
        wrapped_block = '\n'.join(wrapped_sentences)
    else:
        # Break anywhere
        width = max_width - min_indent
        wrapkw = dict(width=width, break_on_hyphens=False,
                      break_long_words=False)
        wrapped_block = '\n'.join(textwrap.wrap(text_, **wrapkw))
    # HACK for last nl (seems to only happen if nl follows a separator):
    # restore a leading/trailing newline that flattening removed
    last_is_nl = text.endswith('\n') and not wrapped_block.endswith('\n')
    first_is_nl = len(text) > 1 and text.startswith('\n') and not wrapped_block.startswith('\n')
    if last_is_nl:
        wrapped_block += '\n'
    if first_is_nl:
        wrapped_block = '\n' + wrapped_block
    # Do the final indentation
    wrapped_text = ub.indent(wrapped_block, ' ' * min_indent)
    return wrapped_text
def format_multiple_paragraph_sentences(text, debug=False, **kwargs):
    """
    Apply ``format_single_paragraph_sentences`` to each paragraph of ``text``
    independently, leaving the separators between paragraphs (blank lines,
    LaTeX commands and comments, docstring markers) untouched.

    FIXME: funky things happen when multiple newlines in the middle of
    paragraphs

    Args:
        text (str): multi-paragraph text to reformat
        debug (bool): print diagnostics.  NOTE(review): debug mode calls
            ``colorprint``, which is not defined in this module — confirm
            it exists elsewhere.
        **kwargs: forwarded to ``format_single_paragraph_sentences``

    Returns:
        str: the reformatted text

    Example:
        >>> text = ub.codeblock(
            '''
            Test paragraph.
            Far out in the uncharted backwaters of the unfashionable end of the
            western spiral arm of the Galaxy lies a small unregarded yellow sun.
            Orbiting this at a distance of roughly ninety-two million miles is an
            utterly insignificant little blue green planet whose ape-descended life
            forms are so amazingly primitive that they still think digital watches
            are a pretty neat idea.
            % ---
            one. two three. four.
            ''')
        >>> formated_text = format_multiple_paragraph_sentences(text, debug=True)
        >>> print('+--- Text ---')
        >>> print(text)
        >>> print('+--- Formated Text ---')
        >>> print(formated_text)
        >>> print('L_____')
    """
    # Hack: blank out lines that contain only spaces
    text = re.sub('^ *$', '', text, flags=re.MULTILINE)
    if debug:
        colorprint(msgblock('[fmt] text', text), 'yellow')
    # Patterns that define separations between paragraphs in latex
    pattern_list = [
        '\n\n\n*',  # newlines
        '\n? *%.*\n',  # comments
        # paragraph commands
        '\n? *\\\\paragraph{[^}]*}\n',
        '\n? *\\\\item \\\\textbf{[^:]*}: *\n',
        '\n? *\\\\section{[^}]*}\n',
        '\n? *\\\\section{[^}]*}\\\\label{[^}]*}\n',
        '\n? *\\\\section{[^}]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^}]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^~]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^}]*}\n',
        '\n? *\\\\subsubsection{[^~]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsubsection{[^}]*}\n',
        '\n----*\n',
        '##* .*\n',
        '\\.}\n',
        '\\?}\n',
        '\n? *\\\\newcommand{[^}]*}.*\n',
        # generic multiline commands with text inside (like devcomment)
        '\n? *\\\\[a-zA-Z]+{ *\n',
        '\n? *\\\\begin{[^}]*}\n',
        '\n? *\\\\item *\n',
        '\n? *\\\\noindent *\n',
        '\n? *\\\\ImageCommand[^}]*}[^}]*}{\n',
        '\n? *\\\\end{[^}]*}\n?',
        '\n}{',
        # docstr stuff
        '\n"""\n',
        '\n? *Args: *\n',
    ]
    pattern = '|'.join('(%s)' % (pat,) for pat in pattern_list)
    # Break the text into paragraph blocks and the separators between them
    block_list, separators = regex_reconstruct_split(pattern, text)
    # Find blocks that must not be formatted: anything sitting between
    # \begin{comment} and \end{comment} separators.
    collapse_pos_list = []
    window_iter = ub.iter_window([''] + separators + [''], 2)
    for idx, (_block, window) in enumerate(zip(block_list, window_iter)):
        prev_sep, next_sep = window
        if (prev_sep.strip() == r'\begin{comment}' and
                next_sep.strip() == r'\end{comment}'):
            collapse_pos_list.append(idx)
    tofmt_block_list = block_list[:]
    # Fold each protected block into its surrounding separators, walking
    # right-to-left so earlier positions remain valid after deletion.
    for pos in sorted(collapse_pos_list, reverse=True):
        merged = (separators[pos - 1] + tofmt_block_list[pos] +
                  separators[pos])
        separators[pos - 1] = merged
        del separators[pos]
        del tofmt_block_list[pos]
    if debug:
        colorprint('[fmt] tofmt_block_list = ' +
                   ub.repr2(tofmt_block_list), 'white')
    # Reformat every remaining paragraph block
    formated_block_list = [
        format_single_paragraph_sentences(block, debug=debug, **kwargs)
        for block in tofmt_block_list
    ]
    rejoined_list = list(interleave((formated_block_list, separators)))
    if debug:
        colorprint('[fmt] formated_block_list = ' +
                   ub.repr2(formated_block_list), 'turquoise')
    formated_text = ''.join(rejoined_list)
    return formated_text