Python - Convert a large structured XML file to CSV without using tag names
I am trying to convert a large XML file to CSV format. Below are my code and a sample file (part of the XML file):
<postaladdress> <id>5464443597076195439</id> <adduserid>system_user</adduserid> <adddate>2013-01-05t18:08:42-06:00</adddate> <lastpersistencetransactionuserid>system_user</lastpersistencetransactionuserid> <lastpersistencetransactiondate>2013-07-11t08:21:34-05:00</lastpersistencetransactiondate> <lastpersistencetransactiontype tc="2"/> <externalreferenceid>3200723</externalreferenceid> <schemaversion>2</schemaversion> <type tc="1"/> <usage tc="2"/> <valid>true</valid> <overridable>true</overridable> <preferred>false</preferred> <line1>4849 ronson ct</line1> <line2>ste 208</line2> <city>san diego</city> <state tc="6"/> <postalcode>92111</postalcode> <country tc="1"/> </postaladdress> below code
import codecs
import sys
import xml.etree.ElementTree as et


class gokul:
    """Stream an XML file into a CSV file, one CSV row per record element.

    The input is parsed incrementally with ``iterparse`` so the whole
    document never has to be held in memory at once.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the XML input for incremental parsing and the CSV output.

        Args:
            input_file: Path (or file object) of the XML document to read.
            output_file: Path of the CSV file to create or overwrite.
            encoding: Text encoding used for the output file.

        Raises:
            OSError: If the output file cannot be opened.
            LookupError: If ``encoding`` is not a known codec.
        """
        self.output_buffer = []
        self.output = None
        self.context = et.iterparse(input_file, events=("start", "end"))
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except (OSError, LookupError):
            print("failed open output file")
            raise

    def convert(self, tag="item", delimiter=",", ignore=None, noheader=False,
                limit=-1, buffer_size=1000):
        """Write every ``tag`` element of the input as one CSV row.

        Each descendant element of a record contributes one column holding
        its stripped text (empty string when it has no text).  Column names
        for the header are taken from the first record; nested elements are
        named ``parent_child``.

        Args:
            tag: Name of the element that delimits one record.
            delimiter: Column separator written between fields.
            ignore: Optional collection of element names to leave out.
            noheader: When true, no header line is written.
            limit: Stop after this many records; ``-1`` means no limit.
            buffer_size: Number of rows held in memory before flushing.

        Returns:
            The number of records written.
        """
        # A fresh container per call avoids the shared-mutable-default trap.
        ignore = () if ignore is None else ignore

        items = []
        header_line = []
        field_name = ''
        tagged = False    # header columns have been finalised
        started = False   # the first record element has been seen
        n = 0

        try:
            # The first event is the start of the root element; keep a handle
            # on it so processed records can be detached from the tree.
            _, root = next(self.context)

            for event, elem in self.context:
                # A non-ignored element seen after the first record started
                # is treated as a field of the current record.
                should_write = (
                    elem.tag != tag and started and elem.tag not in ignore
                )
                # Header names are only collected while reading the first
                # record, and only when a header was requested.
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # Nested fields are named parent_child.
                        field_name = (
                            '_'.join((field_name, elem.tag))
                            if field_name else elem.tag
                        )
                    continue

                # event == 'end'
                if should_write:
                    if should_tag:
                        header_line.append(field_name)
                        # Pop this element off the name chain.
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    text = elem.text
                    items.append(
                        '' if text is None
                        else text.strip().replace('"', r'""')
                    )
                elif elem.tag == tag and items:
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True

                    quoted_sep = r'"' + delimiter + r'"'
                    self.output_buffer.append(
                        r'"' + quoted_sep.join(items) + r'"'
                    )
                    items = []
                    n += 1

                    # Drop the root's references to finished records so
                    # memory stays flat on large inputs.
                    root.clear()

                    if n == limit:
                        break
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()

                # The element has been fully consumed; release its content.
                elem.clear()

            self._write_buffer()
        finally:
            # Close the output even if parsing fails part-way through.
            self.output.close()

        return n

    def _write_buffer(self):
        """Write the buffered rows to the output file and empty the buffer."""
        if not self.output_buffer:
            # Nothing pending: avoid emitting a stray blank line.
            return
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []


if __name__ == "__main__":
    # Usage: script.py INPUT_XML OUTPUT_CSV RECORD_TAG
    converter = gokul(sys.argv[1], sys.argv[2], encoding="utf-8")
    converter.convert(tag=sys.argv[3])
Comments
Post a Comment