mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Add script for scraping Smogon movesets
This commit is contained in:
parent
cc483e1877
commit
6beedb7871
1 changed files with 216 additions and 0 deletions
216
scripts/scrape-smogon-data.py
Normal file
216
scripts/scrape-smogon-data.py
Normal file
|
@ -0,0 +1,216 @@
|
|||
# Simple, stupid, run-and-done script
|
||||
# Scrape, gather, and clean competitive moveset data from Smogon
|
||||
# Saves results to a CSV file in /pokedex/data/csv/smogon_movesets.csv
|
||||
# Requires requests, more info here: https://docs.python-requests.org/en/latest/
|
||||
|
||||
import csv, requests, json
|
||||
from datetime import datetime, timedelta
|
||||
from multiprocessing import pool
|
||||
from time import time
|
||||
|
||||
# Modify these as-needed
|
||||
gens = ["rb", "gs", "rs", "dp", "bw", "xy", "sm", "ss"]
|
||||
path = "../pokedex/data/csv/smogon_movesets.csv"
|
||||
|
||||
def getPkmnNames(gen):
|
||||
if not isinstance(gen, str) or not gen in gens:
|
||||
raise TypeError("Invalid generation provided")
|
||||
else:
|
||||
r = requests.get("https://smogon.com/dex/{}/pokemon".format(gen))
|
||||
htmlString = r.text
|
||||
|
||||
htmlString = htmlString[htmlString.find("dexSettings"):].split("\n")[0][14:]
|
||||
fullData = json.loads(htmlString)["injectRpcs"][1][1]["pokemon"]
|
||||
|
||||
names = [entry["name"] for entry in fullData]
|
||||
|
||||
return names
|
||||
|
||||
def getStrategies(gen, pokemon):
|
||||
if not isinstance(gen, str):
|
||||
raise TypeError("Invalid generation provided")
|
||||
elif not gen in gens:
|
||||
raise ValueError("Invalid generation provided")
|
||||
elif not isinstance(pokemon, str):
|
||||
raise TypeError("Invalid generation provided")
|
||||
else:
|
||||
r = requests.get("https://smogon.com/dex/{0}/pokemon/{1}/".format(gen, pokemon), timeout=10)
|
||||
htmlString = r.text
|
||||
htmlString = htmlString[htmlString.find("dexSettings"):].split("\n")[0][14:]
|
||||
return [gen, json.loads(htmlString)["injectRpcs"][2][1]["strategies"]]
|
||||
|
||||
def flattenData(stratDict):
|
||||
flatData = []
|
||||
for pkmnName in stratDict.keys():
|
||||
for gen in stratDict[pkmnName].keys():
|
||||
for format in stratDict[pkmnName][gen]:
|
||||
for moveset in format["movesets"]:
|
||||
row = [pkmnName,
|
||||
gen,
|
||||
format["format"],
|
||||
format["overview"],
|
||||
format["comments"]]
|
||||
for key in moveset.keys():
|
||||
if key in ["levels", "abilities", "items", "natures"]:
|
||||
itemString = ""
|
||||
for entry in moveset[key]:
|
||||
itemString = itemString + "{}".format(entry) + ", "
|
||||
if len(itemString) > 0:
|
||||
itemString = itemString[0:len(itemString) - 2]
|
||||
row.append(itemString)
|
||||
elif key == "moveslots":
|
||||
for moveslot in moveset[key]:
|
||||
itemString = ""
|
||||
for move in moveslot:
|
||||
itemString = itemString + "{}".format(move["move"])
|
||||
if not move["type"] == None:
|
||||
itemString = itemString + " ({})".format(move["type"])
|
||||
itemString = itemString + "/"
|
||||
if len(itemString) > 0:
|
||||
itemString = itemString[0:len(itemString) - 1]
|
||||
row.append(itemString)
|
||||
elif key in ["evconfigs", "ivconfigs"]:
|
||||
itemString = ""
|
||||
for item in moveset[key]:
|
||||
for stat in item.keys():
|
||||
itemString = itemString + "{}".format(item[stat]) + "/"
|
||||
if len(itemString) > 0:
|
||||
itemString = itemString[0:len(itemString) - 1]
|
||||
itemString = itemString + ", "
|
||||
if len(itemString) > 0:
|
||||
itemString = itemString[0:len(itemString) - 2]
|
||||
row.append(itemString)
|
||||
else:
|
||||
row.append(moveset[key])
|
||||
itemString = ""
|
||||
for item in format["credits"]["teams"]:
|
||||
itemString = itemString + "{}:\n".format(item["name"])
|
||||
for member in item["members"]:
|
||||
if len(member.keys()) == 0:
|
||||
continue
|
||||
itemString = itemString + "- {0}".format(member["username"])
|
||||
if "user_id" in member.keys():
|
||||
itemString = itemString + " (User ID: {})".format(member['user_id'])
|
||||
itemString = itemString + "\n"
|
||||
itemString = itemString[0:len(itemString) - 1]
|
||||
row.append(itemString)
|
||||
itemString = ""
|
||||
for member in format["credits"]["writtenBy"]:
|
||||
itemString = itemString + "- {0}".format(member["username"])
|
||||
if "user_id" in member.keys():
|
||||
itemString = itemString + " (User ID: {})".format(member['user_id'])
|
||||
itemString = itemString + "\n"
|
||||
itemString = itemString[0:len(itemString) - 1]
|
||||
row.append(itemString)
|
||||
flatData.append(row)
|
||||
return flatData
|
||||
|
||||
def main():
|
||||
stratDict = {}
|
||||
procs = []
|
||||
p = pool.Pool()
|
||||
print("The following generations are enabled for collection:")
|
||||
print(gens)
|
||||
print("Process pool created, gathering Pokemon names from Smogon...")
|
||||
startTime = time()
|
||||
print("Data collection started at {}".format(datetime.fromtimestamp(startTime)))
|
||||
try:
|
||||
for gen in gens:
|
||||
for pokemonName in getPkmnNames(gen):
|
||||
if not pokemonName in stratDict.keys():
|
||||
stratDict[pokemonName] = {}
|
||||
procs.append([pokemonName, p.apply_async(getStrategies, (gen, pokemonName))])
|
||||
except Exception as e:
|
||||
print("An error occurred during the setup process.\n" +
|
||||
"This likely means that the Smogon servers were unreachable in some way. " +
|
||||
"For more information, please consult the exception below:")
|
||||
print(e)
|
||||
print("The program will now close.")
|
||||
return -1
|
||||
p.close()
|
||||
print("Names gathered, process pool closed.")
|
||||
print("Gathering strategy data from Smogon...")
|
||||
numComplete = sum(proc[1].ready() for proc in procs)
|
||||
while numComplete == 0:
|
||||
numComplete = sum(proc[1].ready() for proc in procs)
|
||||
numProcs = len(procs)
|
||||
while numComplete < numProcs:
|
||||
for proc in procs:
|
||||
if proc[1].ready():
|
||||
try:
|
||||
proc[1].get()
|
||||
except Exception as e:
|
||||
print("A network error has occurred.\n" +
|
||||
"Typically, this means one of the following:\n" +
|
||||
"1) Your internet connection was lost\n" +
|
||||
"2) The Smogon servers could not be reached in time\n" +
|
||||
"3) There is an issue with either your network or DNS that is preventing you from downloading the target webpage.\n" +
|
||||
"The exception details are shown below:")
|
||||
print(e)
|
||||
print("The program will now close.")
|
||||
return -1
|
||||
timePerProc = (time() - startTime)/numComplete
|
||||
estTimeLeft = timePerProc * (numProcs - numComplete)
|
||||
print("{0}/{1} processes complete, estimated remaining time: {2}".format(numComplete,
|
||||
numProcs,
|
||||
timedelta(seconds=estTimeLeft)),
|
||||
end='\r')
|
||||
numComplete = sum(proc[1].ready() for proc in procs)
|
||||
p.join()
|
||||
print("All download processes complete, process pool joined at {}".format(datetime.fromtimestamp(time())))
|
||||
print("Adding data to strategy dictionary...")
|
||||
for proc in procs:
|
||||
stratDict[proc[0]][proc[1].get()[0]] = proc[1].get()[1]
|
||||
print("Data added to strategy dictionary.")
|
||||
print("\"Flattening\" data...")
|
||||
flatData = flattenData(stratDict)
|
||||
print("Data flattened.")
|
||||
print("Checking for suspicious rows...")
|
||||
for row in flatData:
|
||||
for item in row:
|
||||
if not type(item) in (str, bool, int, float):
|
||||
print("The following row contains data which may cause errors during CSV conversion:")
|
||||
print(row)
|
||||
print("Acceptable data types are str, bool, int, or float. Anything else may cause errors when using the data.")
|
||||
shouldCont = input("If you would like to proceed with the conversion anyways, enter \"y\" to continue: ")
|
||||
if not shouldCont == "y":
|
||||
print("Conversion cancelled.")
|
||||
return -1
|
||||
print("All rows passed.")
|
||||
print("Opening CSV file...")
|
||||
with open(path, "w+") as destination:
|
||||
print("CSV file opened successfully, writing data...")
|
||||
writer = csv.writer(destination)
|
||||
header = ["name",
|
||||
"gen",
|
||||
"format",
|
||||
"overview",
|
||||
"comments",
|
||||
"set name",
|
||||
"pokemon",
|
||||
"shiny",
|
||||
"gender",
|
||||
"levels",
|
||||
"description",
|
||||
"abilities",
|
||||
"items",
|
||||
"move 1",
|
||||
"move 2",
|
||||
"move 3",
|
||||
"move 4",
|
||||
"ev configs",
|
||||
"iv configs",
|
||||
"natures",
|
||||
"writing teams",
|
||||
"Written by"]
|
||||
writer.writerow(header)
|
||||
for row in flatData:
|
||||
writer.writerow(row)
|
||||
print("All rows written, closing file...")
|
||||
destination.close()
|
||||
print("Data saved successfully.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in a new issue