scrape_guilds.py: use playwright now

This commit is contained in:
Artemis Tosini 2024-10-31 05:43:36 +00:00
parent a233cfa432
commit 5d7349062d
Signed by: artemist
GPG key ID: EE5227935FE3FF18
2 changed files with 30 additions and 75 deletions

View file

@@ -125,30 +125,8 @@
python3Packages.aiohttp
python3Packages.black
python3Packages.ipython
python3Packages.trio
python3Packages.gql
(python3Packages.selenium.overrideAttrs (old: {
postInstall =
old.postInstall
+ ''
for ver in v85 v126 v127 v128; do
DEVTOOLS=../common/devtools/chromium/$ver
for proto in js browser; do
python3 ../common/devtools/convert_protocol_to_json.py \
$DEVTOOLS/"$proto"_protocol.pdl \
--map_binary_to_string=true \
$DEVTOOLS/"$proto"_protocol.json
done
mkdir -p $DST_PREFIX/common/devtools/$ver
python3 generate.py \
$DEVTOOLS/browser_protocol.json \
$DEVTOOLS/js_protocol.json \
$DST_PREFIX/common/devtools/$ver
done
'';
nativeBuildInputs = old.nativeBuildInputs ++ [ python3Packages.inflection ];
}))
python3Packages.playwright
];
CHROME = "${chromium}/bin/chromium";

View file

@@ -3,82 +3,59 @@
Scrape guilds with playwright (formerly selenium)
You probably shouldn't use this, but if you really must:
* Set the CHROME environment variable to the path to your chrome or chromium
* You probably want to connect to an existing chrome instance so you can log in first,
run `chromium --user-data-dir=$HOME/.cache/chromium-emoji-script --remote-debugging-port=9222`
to start chromium then `env CHROME=$(command -v chromium) ./scrape_guilds.py 127.0.0.1:9222`
* It just gives you json, you have to dump later
"""
import base64
import json
import os.path
import os
import os.path
import pathlib
import sys
import zlib
from selenium import webdriver
import trio
from playwright.async_api import async_playwright
import asyncio
def setup_driver():
options = webdriver.ChromeOptions()
if location := os.getenv("CHROME"):
options.binary_location = location
options.add_argument("--start-maximized")
options.add_argument(
"--user-data-dir=" + os.path.expanduser("~/.cache/chromium-emoji-script")
)
if len(sys.argv) > 1:
options.add_experimental_option("debuggerAddress", sys.argv[1])
def websocket_handler(out_file):
async def inner(ws):
print(f"Got websocket at `{ws.url}`")
# options.add_experimental_option("detach", True)
return webdriver.Chrome(options=options)
async def handle_events(listener, out_file):
valid_id = None
decompress = None
async for event in listener:
typ = event.__class__.__name__
if typ == "WebSocketCreated":
if event.url.startswith("wss://gateway.discord.gg/"):
valid_id = event.request_id
decompress = zlib.decompressobj()
elif typ == "WebSocketFrameReceived":
if event.request_id != valid_id:
continue
message = json.loads(
decompress.decompress(base64.b64decode(event.response.payload_data))
)
async def handle_message(msg):
message = json.loads(decompress.decompress(msg))
message_type = message.get("t")
await out_file.write(json.dumps(message) + "\n")
out_file.write(json.dumps(message) + "\n")
# The data we actually want, might as well flush
if message_type in ["READY", "GUILD_CREATE"]:
await out_file.flush()
out_file.flush()
print("Got message of type", message_type)
ws.on("framereceived", handle_message)
return inner
async def main():
out_path = await trio.Path.cwd() / "out" / "discord" / "events.json"
await out_path.parent.mkdir(parents=True, exist_ok=True)
out_file = await out_path.open("a")
out_path = pathlib.Path.cwd() / "out" / "discord" / "events.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_file = out_path.open("a")
driver = setup_driver()
async with driver.bidi_connection() as conn:
devtools, session = conn.devtools, conn.session
await session.execute(devtools.network.enable())
listener = session.listen(
devtools.network.WebSocketCreated,
devtools.network.WebSocketFrameReceived,
buffer_size=1024,
async with async_playwright() as p:
browser = await p.chromium.launch_persistent_context(
user_data_dir=os.path.expanduser("~/.cache/chromium-emoji-script"),
executable_path=os.getenv("CHROME"),
headless=False,
)
page = await browser.new_page()
page.on("websocket", websocket_handler(out_file))
await page.goto("https://discord.com/app")
await handle_events(listener, out_file)
await asyncio.Future()
if __name__ == "__main__":
trio.run(main)
asyncio.run(main())