try use xmljson
to parse the data.
# !pip install xmljson
# https://pypi.org/project/xmljson/
from xmljson import parker, Parker
from xml.etree.ElementTree import fromstring
import requests
url = 'https://scsanctions.un.org/al-qaida/'
r = requests.get(url, headers={'User-Agent':'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'})
data = re.sub(r'<?.*?>', '', r.text)
obj = parker.data(fromstring(data))
# change orderdict to normal dict
obj_dict = json.loads(json.dumps(obj))
df_INDIVIDUAL = pd.DataFrame(obj_dict['INDIVIDUALS']['INDIVIDUAL'])
df_INDIVIDUAL.columns
# Index(['DATAID', 'VERSIONNUM', 'FIRST_NAME', 'SECOND_NAME', 'THIRD_NAME',
# 'FOURTH_NAME', 'UN_LIST_TYPE', 'REFERENCE_NUMBER', 'LISTED_ON',
# 'NAME_ORIGINAL_SCRIPT', 'COMMENTS1', 'NATIONALITY', 'LIST_TYPE',
# 'LAST_DAY_UPDATED', 'INDIVIDUAL_ALIAS', 'INDIVIDUAL_ADDRESS',
# 'INDIVIDUAL_DATE_OF_BIRTH', 'INDIVIDUAL_PLACE_OF_BIRTH',
# 'INDIVIDUAL_DOCUMENT', 'SORT_KEY', 'SORT_KEY_LAST_MOD', 'GENDER',
# 'TITLE', 'SUBMITTED_BY'],
# dtype='object')
cols = 'DATAID,VERSIONNUM,FIRST_NAME,SECOND_NAME,THIRD_NAME,FOURTH_NAME,UN_LIST_TYPE'.split(',')
# df_INDIVIDUAL[cols].to_csv('INDIVIDUAL.csv', index=False)
df_INDIVIDUAL[cols]
result:
print(df_INDIVIDUAL[cols].head(3).T)
0 1 2
DATAID 6908576 6908757 6908443
VERSIONNUM 1 1 1
FIRST_NAME IYAD HAJJI 'ABD AL-MALIK
SECOND_NAME NAZMI 'ABD AL-NASIR MUHAMMAD
THIRD_NAME SALIH NaN YUSUF
FOURTH_NAME KHALIL NaN 'UTHMAN 'ABD AL-SALAM
UN_LIST_TYPE Al-Qaida Al-Qaida Al-Qaida
update:
Get repeated tag, e.g. <INDIVIDUAL_ALIAS>.
I am familiar with BeautifulSoup
to parse URL content, so I will use this to parse the content.
soup select use CSS style selector syntaxy.
select_one
return the first item or None
select
return a list
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')
data_list = []
for INDIVIDUAL in soup.select('INDIVIDUAL'):
dataid = INDIVIDUAL.select_one('dataid').text
for individual_alias in INDIVIDUAL.select('individual_alias'):
quality = individual_alias.select_one('quality')
alias_name = individual_alias.select_one('alias_name')
# if quality or alias_name not included in individual_alias, it will return None
quality = quality if quality == None else quality.text
alias_name = alias_name if alias_name == None else alias_name.text
data_list.append((dataid, quality, alias_name))
dfn = pd.DataFrame(data_list, columns=('dataid', 'alias_quality', 'alias_name'))
print(dfn.head())
dataid alias_quality alias_name
0 6908576 Good Ayyad Nazmi Salih Khalil
1 6908576 Good Eyad Nazmi Saleh Khalil
2 6908576 Low Iyad al-Toubasi
3 6908576 Low Iyad al-Tubasi
4 6908576 Low Abu al-Darda'
join to df_INDIVIDUAL:
# groupby dataid to join the alias_name together
alias_name = dfn.groupby('dataid')['alias_name'].agg(list).str.join('###')
# split by ###, and expand list to column
df_alias_name = alias_name.str.split('###', expand=True)
# column name add prefix
df_alias_name = df_alias_name.add_prefix('NDIVIDUAL_ALIAS.ALIAS_NAME/')
# set the DATAID as type int to join df_INDIVIDUAL
df_alias_name.index = df_alias_name.index.astype(int)
df_INDIVIDUAL = df_INDIVIDUAL.join(df_alias_name, on='DATAID', how='left')