Default page

Your IP: 216.73.216.118
Server IP: 88.222.222.98
Server: Linux sg-nme-web1518.main-hosting.eu 5.14.0-611.16.1.el9_7.x86_64 #1 SMP PREEMPT_DYNAMIC Mon Dec 22 03:40:39 EST 2025 x86_64
Server Software: LiteSpeed
PHP Version: 8.3.28
Buat File | Buat Folder

Dir : ~/opt/gsutil/third_party/pyparsing/examples/

View File Name : urlExtractor.py

# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags, pyparsing_common as ppc
from urllib.request import urlopen
import pprint

# Expressions for the opening <a ...> tag and the closing </a> tag.
linkOpenTag, linkCloseTag = makeHTMLTags("a")


def _collapse_whitespace(toks):
    """Return the first token with every run of whitespace reduced to one space."""
    words = toks[0].strip().split()
    return " ".join(words)


# Parse actions on the tag body run in the order given: HTML markup is stripped
# first, then the whitespace in what remains is normalized.
linkBody = linkOpenTag.tag_body
linkBody.setParseAction(ppc.stripHTMLTags, _collapse_whitespace)

# A full link is the open tag, the body (stored under the results name "body"),
# and the close tag, which is matched but left out of the results.
link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with urlopen("https://www.cnn.com/") as serverListPage:
    # Decode with the charset declared in the response's Content-Type header,
    # falling back to UTF-8 only when the server declares none.  A hard-coded
    # UTF-8 decode garbles (or raises UnicodeDecodeError on) pages that are
    # served in a different encoding.
    charset = serverListPage.headers.get_content_charset() or "UTF-8"
    htmlText = serverListPage.read().decode(charset)

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
#
# The page is scanned only once: each match is printed as it is found and, in the
# same pass, recorded in a dictionary that maps the link's body text to its href.
# If several links share the same body text, the last one scanned wins.
linksByBody = {}
for toks, _strt, _end in link.scanString(htmlText):
    print(toks.asList())
    linksByBody[toks.body] = toks.href

# Pretty-print the body-text -> href dictionary assembled during the scan.
pprint.pprint(linksByBody)