scrape_guilds.py: use playwright now

This commit is contained in:
Artemis Tosini 2024-10-31 05:43:36 +00:00
parent a233cfa432
commit 5d7349062d
Signed by: artemist
GPG key ID: EE5227935FE3FF18
2 changed files with 30 additions and 75 deletions

View file

@ -125,30 +125,8 @@
python3Packages.aiohttp python3Packages.aiohttp
python3Packages.black python3Packages.black
python3Packages.ipython python3Packages.ipython
python3Packages.trio
python3Packages.gql python3Packages.gql
(python3Packages.selenium.overrideAttrs (old: { python3Packages.playwright
postInstall =
old.postInstall
+ ''
for ver in v85 v126 v127 v128; do
DEVTOOLS=../common/devtools/chromium/$ver
for proto in js browser; do
python3 ../common/devtools/convert_protocol_to_json.py \
$DEVTOOLS/"$proto"_protocol.pdl \
--map_binary_to_string=true \
$DEVTOOLS/"$proto"_protocol.json
done
mkdir -p $DST_PREFIX/common/devtools/$ver
python3 generate.py \
$DEVTOOLS/browser_protocol.json \
$DEVTOOLS/js_protocol.json \
$DST_PREFIX/common/devtools/$ver
done
'';
nativeBuildInputs = old.nativeBuildInputs ++ [ python3Packages.inflection ];
}))
]; ];
CHROME = "${chromium}/bin/chromium"; CHROME = "${chromium}/bin/chromium";

View file

@ -3,82 +3,59 @@
Scrape guilds with selenium Scrape guilds with selenium
You probably shouldn't use this, but if you really must: You probably shouldn't use this, but if you really must:
* Set the CHROME environment variable to the path to your chrome or chromium * Set the CHROME environment variable to the path to your chrome or chromium
* You probably want to connect to an existing chrome instance so you can log in first,
run `chromium --user-data-dir=$HOME/.cache/chromium-emoji-script --remote-debugging-port=9222`
to start chromium then `env CHROME=$(command -v chromium) ./scrape_guilds.py 127.0.0.1:9222`
* It just gives you json, you have to dump later * It just gives you json, you have to dump later
""" """
import base64 import base64
import json import json
import os.path
import os import os
import os.path
import pathlib
import sys import sys
import zlib import zlib
from selenium import webdriver from playwright.async_api import async_playwright
import trio import asyncio
def setup_driver(): def websocket_handler(out_file):
options = webdriver.ChromeOptions() async def inner(ws):
if location := os.getenv("CHROME"): print(f"Got websocket at `{ws.url}`")
options.binary_location = location
options.add_argument("--start-maximized")
options.add_argument(
"--user-data-dir=" + os.path.expanduser("~/.cache/chromium-emoji-script")
)
if len(sys.argv) > 1:
options.add_experimental_option("debuggerAddress", sys.argv[1])
# options.add_experimental_option("detach", True) decompress = zlib.decompressobj()
return webdriver.Chrome(options=options)
async def handle_message(msg):
async def handle_events(listener, out_file): message = json.loads(decompress.decompress(msg))
valid_id = None
decompress = None
async for event in listener:
typ = event.__class__.__name__
if typ == "WebSocketCreated":
if event.url.startswith("wss://gateway.discord.gg/"):
valid_id = event.request_id
decompress = zlib.decompressobj()
elif typ == "WebSocketFrameReceived":
if event.request_id != valid_id:
continue
message = json.loads(
decompress.decompress(base64.b64decode(event.response.payload_data))
)
message_type = message.get("t") message_type = message.get("t")
await out_file.write(json.dumps(message) + "\n") out_file.write(json.dumps(message) + "\n")
# The data we actually want, might as well flush # The data we actually want, might as well flush
if message_type in ["READY", "GUILD_CREATE"]: if message_type in ["READY", "GUILD_CREATE"]:
await out_file.flush() out_file.flush()
print("Got message of type", message_type) print("Got message of type", message_type)
ws.on("framereceived", handle_message)
return inner
async def main(): async def main():
out_path = await trio.Path.cwd() / "out" / "discord" / "events.json" out_path = pathlib.Path.cwd() / "out" / "discord" / "events.json"
await out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
out_file = await out_path.open("a") out_file = out_path.open("a")
driver = setup_driver() async with async_playwright() as p:
async with driver.bidi_connection() as conn: browser = await p.chromium.launch_persistent_context(
devtools, session = conn.devtools, conn.session user_data_dir=os.path.expanduser("~/.cache/chromium-emoji-script"),
executable_path=os.getenv("CHROME"),
await session.execute(devtools.network.enable()) headless=False,
listener = session.listen(
devtools.network.WebSocketCreated,
devtools.network.WebSocketFrameReceived,
buffer_size=1024,
) )
page = await browser.new_page()
page.on("websocket", websocket_handler(out_file))
await page.goto("https://discord.com/app")
await handle_events(listener, out_file) await asyncio.Future()
if __name__ == "__main__": if __name__ == "__main__":
trio.run(main) asyncio.run(main())