⚝
One Hat Cyber Team
⚝
Your IP:
216.73.216.118
Server IP:
84.32.84.115
Server:
Linux sg-nme-web1518.main-hosting.eu 5.14.0-611.16.1.el9_7.x86_64 #1 SMP PREEMPT_DYNAMIC Mon Dec 22 03:40:39 EST 2025 x86_64
Server Software:
LiteSpeed
PHP Version:
8.3.28
Buat File
|
Buat Folder
Eksekusi
Dir :
~
/
opt
/
gsutil
/
third_party
/
pyparsing
/
examples
/
View File Name :
html_stripper.py
#
# html_stripper.py
#
#  Sample code for stripping HTML markup tags and scripts from
#  HTML source files.
#
# Copyright (c) 2006, 2016, 2023, Paul McGuire
#
from urllib.request import urlopen

from pyparsing import (
    LineEnd,
    quoted_string,
    make_html_tags,
    common_html_entity,
    replace_html_entity,
    html_comment,
    any_open_tag,
    any_close_tag,
    replace_with,
)

# if <script> tags found, remove script content also
script_open, script_close = make_html_tags("script")
script_body = script_open + ... + script_close

# translate HTML entities
# NOTE: the parse action is attached to the expression object imported from
# pyparsing itself (not to a copy), so it is a module-level side effect of
# importing this file.
common_html_entity.set_parse_action(replace_html_entity)

# Alternatives are tried left to right, so the order below matters.
stripper = (
    # parse quoted strings first, if they enclose HTML tags - keep these
    quoted_string
    # parse and translate HTML entities (&amp;, &lt;, &gt;, etc.)
    | common_html_entity
    # expressions to be stripped - suppress() will remove them when transforming
    | (html_comment | script_body | any_open_tag | any_close_tag).suppress()
)

# two or more consecutive line ends are replaced by exactly one blank line
repeated_newlines = LineEnd()[2, ...]
repeated_newlines.set_parse_action(replace_with("\n\n"))

if __name__ == "__main__":
    # get some HTML
    target_url = "https://wiki.python.org/moin/PythonDecoratorLibrary"
    with urlopen(target_url) as targetPage:
        # decode using the charset declared in the response's Content-Type
        # header; fall back to UTF-8 when the server does not declare one
        charset = targetPage.headers.get_content_charset() or "UTF-8"
        target_html = targetPage.read().decode(charset)

    # first pass, strip out tags and translate entities
    # (use transform_string() instead of parse_string - will do
    # suppressions and parse actions)
    first_pass = stripper.transform_string(target_html)

    # first pass leaves many blank lines, collapse these down
    second_pass = repeated_newlines.transform_string(first_pass)

    print(second_pass)