Source code for vimtk._dirty

"""
This is a module for quick and dirty ports from other codebases that I'd like
to include, but need to be cleaned up and integrated into vimtk properly.

This is a staging ground where I can use them, but also keep them separate.


Port Logic
----------
# Liberator makes porting dirty code pretty easy

import liberator
import utool as ut

lib = liberator.Liberator()
lib.add_dynamic(ut.format_multiple_paragraph_sentences)
lib.expand(['utool'])
source = lib.current_sourcecode()
print(source)


lib = liberator.Liberator()
lib.add_dynamic(ut.remove_doublspaces)
lib.add_dynamic(ut.flatten_textlines)
lib.add_dynamic(ut.get_minimum_indentation)
lib.add_dynamic(ut.regex_or)
lib.add_dynamic(ut.interleave)
lib.expand(['utool'])
source = lib.current_sourcecode()
print(source)

"""
import re
import ubelt as ub


def regex_reconstruct_split(pattern, text):
    """
    Split ``text`` on every match of ``pattern`` and keep the separators.

    The text can be rebuilt exactly by alternating the returned blocks and
    separators: ``block[0] + sep[0] + block[1] + ... + block[-1]``.

    Args:
        pattern (str): regular expression describing the separators
        text (str): text to split

    Returns:
        Tuple[List[str], List[str]]: ``(block_list, separators)`` where
            ``len(block_list) == len(separators) + 1``.

    Example:
        >>> regex_reconstruct_split('-+', 'a--b-c')
        (['a', 'b', 'c'], ['--', '-'])
    """
    block_list = []
    separators = []
    cursor = 0
    for match in re.finditer(pattern, text):
        # Slice by match position instead of searching for the separator
        # text again: the same text may occur earlier without being a match
        # (e.g. look-behind patterns), and zero-width matches cannot be
        # passed to ``str.split``.
        block_list.append(text[cursor:match.start()])
        separators.append(match.group())
        cursor = match.end()
    block_list.append(text[cursor:])
    return block_list, separators
def msgblock(key, text, side='|'):
    """
    Frame ``text`` in a simple ASCII box labelled with ``key``.

    Args:
        key (str): label written in the header and footer lines
        text (str): content; every ``'\\n'``-separated line gets its own row
        side (str): marker drawn at the left of every content row

    Returns:
        str: the framed text, terminated by a newline
    """
    header = ' + --- ' + key + ' ---\n'
    footer = ' L ___ ' + key + ' ___\n'
    rows = ''.join(' ' + side + ' ' + row + '\n' for row in text.split('\n'))
    return header + rows + footer
def get_indentation(line_):
    """
    Count the leading whitespace characters of a line.

    Note that ``str.lstrip`` without arguments is used, so any leading
    whitespace (tabs included) is counted, not only spaces.

    Args:
        line_ (str): a single line of text

    Returns:
        int: number of leading whitespace characters
    """
    without_leading = line_.lstrip()
    return len(line_) - len(without_leading)
def get_minimum_indentation(text):
    r"""
    Return the smallest indentation among the non-blank lines of ``text``.

    Lines that are empty or contain only whitespace are ignored.

    Args:
        text (str): unicode text

    Returns:
        int: the minimum indentation, or 0 when there is no non-blank line

    Example:
        >>> # ENABLE_DOCTEST
        >>> text = '   foo\n   bar'
        >>> result = get_minimum_indentation(text)
        >>> print(result)
        3
    """
    smallest = None
    for line_ in text.split('\n'):
        if not line_.strip():
            # blank lines carry no indentation information
            continue
        indent = get_indentation(line_)
        if smallest is None or indent < smallest:
            smallest = indent
    return 0 if smallest is None else smallest
def interleave(args):
    r"""
    Yield items from several iterables in round-robin order.

    One item is taken from each iterable in turn. Iteration stops as soon as
    the iterable whose turn it is has no more items, so an earlier iterable
    may contribute one more item than a later one.

    Args:
        args (tuple): tuple of lists to interleave

    Yields:
        object: the next item in round-robin order

    Example:
        >>> args = ([1, 2, 3, 4, 5], ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
        >>> genresult = interleave(args)
        >>> result = ub.repr2(list(genresult), nl=False)
        >>> print(result)
        [1, 'A', 2, 'B', 3, 'C', 4, 'D', 5, 'E']
    """
    iterators = [iter(arg) for arg in args]
    if not iterators:
        return
    turn = 0
    while True:
        try:
            yield next(iterators[turn])
        except StopIteration:
            return
        turn = (turn + 1) % len(iterators)
def colorprint(text, color):
    """
    Print ``text`` after passing it through ``ub.color_text``.

    Args:
        text (str): text to print
        color (str): color name forwarded to ``ub.color_text``

    Returns:
        None: the text is printed, nothing is returned
    """
    colored = ub.color_text(text, color)
    print(colored)
def format_single_paragraph_sentences(text, debug=False, myprefix=True,
                                      sentence_break=True, max_width=73,
                                      sepcolon=True):
    r"""
    Reflow a single paragraph, optionally starting every sentence on its own
    line, which makes densely packed paragraphs easier to read.

    The paragraph is first flattened onto one line (runs of spaces are
    collapsed, newlines become spaces), then split into sentences, wrapped,
    and finally re-indented with the indentation of the input rounded down
    to a multiple of four.

    Args:
        text (str): the paragraph to format
        debug (bool): if True, print intermediate results
        myprefix (bool): if True, continuation lines of a wrapped sentence
            are prefixed so they can be told apart from sentence starts
        sentence_break (bool): if True, break lines at sentence boundaries;
            otherwise only wrap at ``max_width``
        max_width (int | None): maximum line width including indentation.
            If None, no wrapping is done.
        sepcolon (bool): if True, ``':'`` also ends a sentence

    Returns:
        str: wrapped_text

    Example:
        >>> # DISABLE_DOCTEST
        >>> text = 'one. two three. four.'
        >>> wrapped_text = format_single_paragraph_sentences(text)
        >>> print(wrapped_text)
        one.
        two three.
        four.
    """
    import itertools
    import textwrap

    # The indentation is rounded down to a multiple of four and re-applied
    # to the wrapped block at the very end.
    min_indent = (get_minimum_indentation(text) // 4) * 4

    if debug:
        # colorprint prints by itself and returns None; do not print its result
        colorprint(msgblock('preflat', repr(text)), 'darkyellow')

    def remove_doublspaces(text):
        # Collapse every run of two or more spaces into a single space
        return re.sub(' {2,}', ' ', text)

    def flatten_textlines(text):
        # Join all lines into one, dropping the spaces around each newline
        return re.sub(' *\n *', ' ', text, flags=re.MULTILINE).strip(' ')

    # TODO: more intelligent sentence parsing
    # Both normalization steps are chained; previously the result of
    # remove_doublspaces was overwritten and never used.
    text_ = flatten_textlines(remove_doublspaces(text))

    if debug:
        colorprint(msgblock('postflat', repr(text_)), 'yellow')

    raw_sep_chars = ['.', '?', '!', ':']
    if not sepcolon:
        raw_sep_chars.remove(':')

    def split_sentences(text_):
        # TODO: rectify with split_sentences2
        # A separator is a run of sentence-ending characters or enumeration
        # markers like "(1)", followed by exactly one whitespace character.
        sep_alternatives = [re.escape(char) for char in raw_sep_chars]
        sep_alternatives.append(re.escape('(') + r'\d' + re.escape(')'))
        sep_pattern = '(' + '|'.join(sep_alternatives) + ')'
        full_pattern = '(' + sep_pattern + r'+\s)'
        full_regex = re.compile(full_pattern)
        # re.split also returns the text of every capture group, so the
        # result cycles through (sentence, group 1, ..., group N).
        num_groups = full_regex.groups
        split_list = full_regex.split(text_)
        num_bins = num_groups + 1
        sentence_list = split_list[0::num_bins]
        # group 1 is the complete separator including trailing whitespace
        sep_list = split_list[1::num_bins]
        if debug:
            print('<SPLIT DBG>')
            print('num_groups = %r' % (num_groups,))
            print('len(split_list) = %r' % (len(split_list)))
            print('len(split_list) / len(sentence_list) = %r' % (
                len(split_list) / len(sentence_list)))
            print('len(sentence_list) = %r' % (len(sentence_list),))
            print('len(sep_list_group1) = %r' % (len(sep_list),))
            print('full_pattern = %s' % (full_pattern,))
            print('sentence_list = %s' % (ub.repr2(sentence_list),))
            print('sep_list = %s' % (sep_list,))
            print('</SPLIT DBG>')
        return sentence_list, sep_list

    def wrap_sentences(sentence_list, min_indent, max_width):
        # prefix for continuation lines of a wrapped sentence
        sentence_prefix = ' ' if myprefix else ''
        if text_.startswith('>>>'):
            # Hack to do docstrings
            # TODO: make an actual docstring reformatter
            sentence_prefix = '... '
        if max_width is None:
            return sentence_list[:]
        width = max_width - min_indent - len(sentence_prefix)
        wrapkw = dict(width=width, break_on_hyphens=False,
                      break_long_words=False)
        wrapped_sentences = []
        for sentence in sentence_list:
            sentence_lines = textwrap.wrap(sentence, **wrapkw)
            # only the continuation lines of a sentence get the prefix
            sentence_lines = [
                line_ if lineno == 0 else sentence_prefix + line_
                for lineno, line_ in enumerate(sentence_lines)
            ]
            wrapped_sentences.append('\n'.join(sentence_lines))
        return wrapped_sentences

    def rewrap_sentences2(sentence_list, sep_list):
        # FIXME: probably where nl error is
        # Attach each separator either to the end of the sentence before it
        # (punctuation) or to the start of the sentence after it (markers
        # such as "(1)").
        sentence_list2 = ['']
        for sentence, sep in itertools.zip_longest(sentence_list, sep_list):
            if sep is None:
                # the final sentence has no trailing separator
                sentence_list2[-1] += sentence
                continue
            sepchars = sep.strip()
            if len(sepchars) > 0 and sepchars[0] in raw_sep_chars:
                # punctuation closes the current sentence
                sentence_list2[-1] += sentence + sepchars
                sentence_list2.append('')
            else:
                # place the separator before the next sentence
                sentence_list2[-1] += sentence
                sentence_list2.append(sep)
        return [x.strip() for x in sentence_list2 if len(x.strip()) > 0]

    if sentence_break:
        # Break at sentences
        sentence_list, sep_list = split_sentences(text_)
        sentence_list2 = rewrap_sentences2(sentence_list, sep_list)
        wrapped_sentences = wrap_sentences(sentence_list2, min_indent,
                                           max_width)
        wrapped_block = '\n'.join(wrapped_sentences)
    elif max_width is None:
        # No width limit: nothing to wrap (mirrors wrap_sentences)
        wrapped_block = text_
    else:
        # Break anywhere
        wrapkw = dict(width=max_width - min_indent, break_on_hyphens=False,
                      break_long_words=False)
        wrapped_block = '\n'.join(textwrap.wrap(text_, **wrapkw))

    # HACK for last nl (seems to only happen if nl follows a separator):
    # flattening drops a leading / trailing newline of the input, restore it
    last_is_nl = text.endswith('\n') and not wrapped_block.endswith('\n')
    first_is_nl = (len(text) > 1 and text.startswith('\n') and
                   not wrapped_block.startswith('\n'))
    if last_is_nl:
        wrapped_block += '\n'
    if first_is_nl:
        wrapped_block = '\n' + wrapped_block

    # Do the final indentation
    wrapped_text = ub.indent(wrapped_block, ' ' * min_indent)
    return wrapped_text
def format_multiple_paragraph_sentences(text, debug=False, **kwargs):
    """
    Format every paragraph of a (LaTeX-like) text independently.

    The text is split into paragraph blocks at a fixed set of separator
    patterns (blank lines, comments, sectioning commands, environment
    begin / end markers, ...). Every block is reformatted with
    :func:`format_single_paragraph_sentences` and the result is stitched
    back together with the original separators. Blocks enclosed by
    ``\\begin{comment}`` / ``\\end{comment}`` are left untouched.

    FIXME: funky things happen when multiple newlines in the middle of
    paragraphs

    Args:
        text (str): the text to format
        debug (bool): if True, print intermediate results
        **kwargs: forwarded to :func:`format_single_paragraph_sentences`

    Returns:
        str: the formatted text

    Example:
        >>> # DISABLE_DOCTEST
        >>> text = 'Test paragraph. One. Two three.\\n\\n% ---\\none. two. four.'
        >>> formated_text = format_multiple_paragraph_sentences(text, debug=True)
        >>> print('+--- Text ---')
        >>> print(text)
        >>> print('+--- Formated Text ---')
        >>> print(formated_text)
        >>> print('L_____')
    """
    # Hack: empty out lines that contain nothing but spaces
    text = re.sub('^ *$', '', text, flags=re.MULTILINE)
    if debug:
        colorprint(msgblock('[fmt] text', text), 'yellow')

    # Patterns that define separations between paragraphs in latex
    newline_patterns = [
        '\n\n\n*',
    ]
    comment_patterns = [
        '\n? *%.*\n',
    ]
    command_patterns = [
        '\n? *\\\\paragraph{[^}]*}\n',
        '\n? *\\\\item \\\\textbf{[^:]*}: *\n',
        '\n? *\\\\section{[^}]*}\n',
        '\n? *\\\\section{[^}]*}\\\\label{[^}]*}\n',
        '\n? *\\\\section{[^}]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^}]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^~]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsection{[^}]*}\n',
        '\n? *\\\\subsubsection{[^~]*}\\~?\\\\label{[^}]*}\n',
        '\n? *\\\\subsubsection{[^}]*}\n',
        '\n----*\n',
        '##* .*\n',
        '\\.}\n',
        '\\?}\n',
        '\n? *\\\\newcommand{[^}]*}.*\n',
        # generic multiline commands with text inside (like devcomment)
        '\n? *\\\\[a-zA-Z]+{ *\n',
        '\n? *\\\\begin{[^}]*}\n',
        '\n? *\\\\item *\n',
        '\n? *\\\\noindent *\n',
        '\n? *\\\\ImageCommand[^}]*}[^}]*}{\n',
        '\n? *\\\\end{[^}]*}\n?',
        '\n}{',
    ]
    docstr_patterns = [
        '\n"""\n',
        '\n? *Args: *\n',
    ]
    # The order matters: earlier alternatives win in the combined regex
    pattern_list = (newline_patterns + comment_patterns +
                    command_patterns + docstr_patterns)
    pattern = '|'.join('(%s)' % (pat,) for pat in pattern_list)

    # break into paragraph blocks
    block_list, separators = regex_reconstruct_split(pattern, text)

    # Dont format things within certain block types: find the blocks that
    # sit directly between a comment-begin and a comment-end separator.
    padded_separators = [''] + separators + ['']
    neighbors = ub.iter_window(padded_separators, 2)
    collapse_pos_list = [
        index
        for index, (_, (before, after)) in enumerate(zip(block_list, neighbors))
        if before.strip() == r'\begin{comment}' and
        after.strip() == r'\end{comment}'
    ]

    tofmt_block_list = list(block_list)
    # Work from the back so earlier positions stay valid while deleting
    for index in sorted(collapse_pos_list, reverse=True):
        # Fold the protected block and both surrounding separators into a
        # single separator, which is passed through unformatted.
        separators[index - 1] = (separators[index - 1] +
                                 tofmt_block_list[index] +
                                 separators[index])
        del separators[index]
        del tofmt_block_list[index]

    if debug:
        colorprint('[fmt] tofmt_block_list = ' + ub.repr2(tofmt_block_list),
                   'white')

    # apply formatting
    formated_block_list = [
        format_single_paragraph_sentences(block, debug=debug, **kwargs)
        for block in tofmt_block_list
    ]
    rejoined_list = list(interleave((formated_block_list, separators)))
    if debug:
        colorprint('[fmt] formated_block_list = ' +
                   ub.repr2(formated_block_list), 'turquoise')
    return ''.join(rejoined_list)