python - Convert a large structured XML file to CSV without using tag names -


I am trying to convert a large XML file to CSV format. Below are a sample file (part of the XML file) and my code.

<postaladdress>     <id>5464443597076195439</id>     <adduserid>system_user</adduserid>     <adddate>2013-01-05t18:08:42-06:00</adddate>     <lastpersistencetransactionuserid>system_user</lastpersistencetransactionuserid>     <lastpersistencetransactiondate>2013-07-11t08:21:34-05:00</lastpersistencetransactiondate>     <lastpersistencetransactiontype tc="2"/>     <externalreferenceid>3200723</externalreferenceid>     <schemaversion>2</schemaversion>     <type tc="1"/>     <usage tc="2"/>     <valid>true</valid>     <overridable>true</overridable>     <preferred>false</preferred>     <line1>4849 ronson ct</line1>     <line2>ste 208</line2>     <city>san diego</city>     <state tc="6"/>     <postalcode>92111</postalcode>     <country tc="1"/> </postaladdress> 

Below is the code:

import codecs
import sys
import xml.etree.ElementTree as et


class gokul:
    """Stream an XML file into a CSV file, one CSV row per record element.

    The input is read incrementally with ``ElementTree.iterparse`` and rows
    are buffered in memory and written out in batches, so the whole XML tree
    is never held in memory at once.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the XML input for iteration and the CSV output for writing.

        Args:
            input_file: Path (or file object) of the XML document to read.
            output_file: Path of the CSV file to create/overwrite.
            encoding: Text encoding used for the output file.

        Raises:
            OSError: If the output file cannot be opened.
            LookupError: If ``encoding`` is not a known codec.
        """
        self.output_buffer = []
        self.output = None
        # "start" events are needed to build nested field names,
        # "end" events are needed because only then is elem.text complete.
        self.context = et.iterparse(input_file, events=("start", "end"))
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except (OSError, LookupError):
            print("Failed to open the output file")
            raise

    def convert(self, tag="item", delimiter=",", ignore=None, noheader=False,
                limit=-1, buffer_size=1000):
        """Convert the XML input to CSV and close the output file.

        Args:
            tag: Name of the element that represents one record (one row).
            delimiter: Field separator written between values.
            ignore: Collection of element names to leave out of the output.
                ``None`` (the default) ignores nothing.
            noheader: When true, no header line is written.
            limit: Stop after this many records; ``-1`` means no limit.
            buffer_size: Number of buffered rows that triggers a flush to disk.

        Returns:
            The number of records written.
        """
        ignore = () if ignore is None else ignore

        items = []        # values of the record currently being read
        header_line = []  # field names collected from the first record
        field_name = ''   # chain of nested tag names joined with '_'

        tagged = False    # True once the first record has been emitted
        started = False   # True once the first record element was seen
        n = 0

        try:
            # Consume the root element's "start" event.
            next(self.context)

            for event, elem in self.context:
                # Any non-ignored element other than the record tag is a
                # value once the first record has started.
                should_write = (
                    elem.tag != tag and started and elem.tag not in ignore
                )
                # Field names are only collected while reading the first record.
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # Nested elements get "parent_child" style names.
                        field_name = (
                            '_'.join((field_name, elem.tag))
                            if field_name else elem.tag
                        )
                    continue

                # event == 'end'
                if should_write:
                    if should_tag:
                        header_line.append(field_name)
                        # Drop the current tag from the name chain.
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    # Double embedded quotes so the quoted value stays valid CSV.
                    items.append(
                        '' if elem.text is None
                        else elem.text.strip().replace('"', '""')
                    )
                elif elem.tag == tag and items:
                    # End of one record: emit header (once) and the row.
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True

                    self.output_buffer.append(
                        '"' + ('"' + delimiter + '"').join(items) + '"'
                    )
                    items = []
                    n += 1

                    if n == limit:
                        break

                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()

                # Discard the finished element to keep memory usage flat.
                elem.clear()

            self._write_buffer()
        finally:
            # Always release the output handle, even if parsing fails.
            self.output.close()

        return n

    def _write_buffer(self):
        """Write buffered records to the output file and empty the buffer."""
        if not self.output_buffer:
            # Nothing pending; avoid emitting a stray blank line.
            return
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []


if __name__ == "__main__":
    # Usage: script.py INPUT_XML OUTPUT_CSV RECORD_TAG
    converter = gokul(sys.argv[1], sys.argv[2], encoding="utf-8")
    converter.convert(tag=sys.argv[3])


Comments

Popular posts from this blog

java - pagination of xlsx file to XSSFworkbook using apache POI -

Unlimited choices in BASH case statement -

apache - How do I stop my index.php being run twice for every user -