|     IEEE Xplore Digital Library     |     IEEE Standards     |     IEEE Spectrum     |     More Sites

Verified Commit 284d931f authored by Emi Simpson
Browse files

Merge remote-tracking branch 'cleaner/main' into main

parents 24e08af9 041c974d
/* Modifications added automatically by LaTeX cleaner */

/* Footnote reference markers: small, blue, raised, and not underlined. */
.footnote-ref {
    color: blue;
    vertical-align: super;
    font-size: 0.6rem;
    text-decoration: none;
}

/* Footnote text is set slightly smaller than the surrounding text. */
.footnote {
    font-size: 0.7em;
}
from typing import Optional
from bs4 import BeautifulSoup
from bs4.element import PageElement, Tag
def assert_is_tag_after_find(element: PageElement) -> Tag:
    """Check that a class-based search result is a ``Tag`` and return it.

    Args:
        element: The object handed back by a search.

    Returns:
        ``element`` itself, unchanged.

    Raises:
        AssertionError: If ``element`` is not a ``Tag`` instance.
    """
    failure_message = (
        'Unexpected: Find called with a class returned '
        'something other than a tag looking for a class'
    )
    assert isinstance(element, Tag), failure_message
    return element
def find_tag(base: Tag, class_: str) -> Optional[Tag]:
    """Search beneath ``base`` for an element with the given class.

    Args:
        base: Tag whose ``find`` method performs the search.
        class_: Class name passed to ``find`` as its ``class_`` filter.

    Returns:
        The match, verified to be a ``Tag``, or ``None`` when ``find``
        returns ``None``.
    """
    found = base.find(class_=class_)
    return None if found is None else assert_is_tag_after_find(found)
# Script body: parse index.html, then turn each LaTeX footnote (class
# ltx_note) into a numbered reference plus a separate footnote paragraph.
soup = BeautifulSoup(open('index.html'), 'html.parser')
# Find the main element
ltx_main = find_tag(soup, 'ltx_page_main')
assert ltx_main is not None, "Bad HTML: LaTeX page doesn't contain a main element "\
"(ltx_page_main). Are you sure this is a LaTeX html page "\
"rendered with tex2html?"
# Add a new section to contain the footnotes
footnote_section = soup.new_tag('section')
# Visit every element carrying the ltx_note class
for footnote_wrapper in soup.find_all(class_ = 'ltx_note'):
footnote_wrapper = assert_is_tag_after_find(footnote_wrapper)
# Find the content element (that's the box with the text that the footnote refers to)
note_content = find_tag(footnote_wrapper, 'ltx_note_content')
assert note_content is not None, 'Bad HTML: Footnote exists without any content '\
'element (ltx_note_content)'
# Find the mark in the content and remove it. We'll replace it with our own later
note_mark = find_tag(note_content, class_='ltx_note_mark')
assert note_mark is not None, 'Bad HTML: ltx_note (footnote) object without an ' \
'ltx_note_mark (superscript number denoting the ' \
'footnote number) present in the note content '\
note_mark = note_mark.extract()
# Find the note number
note_number_raw = note_mark.string
assert note_number_raw is not None, 'Bad HTML: ltx_note_mark is present in a ' \
'footnote, but is empty'
# The mark's text must parse as an integer; a ValueError is reported as bad HTML
note_number = int(note_number_raw)
except ValueError:
raise RuntimeError('Bad HTML: ltx_note_mark is present in a footnote, but '\
'contains a non-numeric footnote number')
# Replace the old footnote with just a reference to the new footnote
new_reference = soup.new_tag(
new_reference['class'] = 'footnote-ref'
new_reference.string = str(note_number)
footnote_wrapper = footnote_wrapper.extract()
# Build the actual footnote the reference is pointing to
# The footnote paragraph's id is "fn-" followed by the note number
new_content = soup.new_tag('p', id = f'fn-{note_number}')
new_content['class'] = 'footnote'
# The backlink gets the same class and number text as the in-text reference
backlink = soup.new_tag(
backlink['class'] = 'footnote-ref'
backlink.string = str(note_number)
# Save the new soup
with open('cleaned.html', 'w') as output_file:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment