#!/usr/bin/env python
# vim: set fileencoding=iso-8859-1
"""
$ pdftotext -layout -nopgbrk -f 303 -l 305 07_03pu.pdf page303.txt
$ python ParseAttributes.py page273-1000.txt out.txt > log2
$ grep ADD log2 | grep -v "Notes:" | grep -v "Note:" | grep -v "C.8" | grep -v "C.7"
"""
import re,os
"""
"""
class Attribute:
    """One attribute row (name, tag, type, description) of a parsed table.

    The row is normally filled from its first text line with SetInit() and
    then extended with AppendName() / AppendDescription() as continuation
    lines are read.
    """

    # Layout of the first text line of an attribute row, e.g.
    #   Blue Palette Color Lookup Table (0028,1103) 1C Specifies the format ...
    # Groups: 1 = name, 2 = "(gggg,eeee)" tag, 3 = type, 4 = description.
    _INIT_PATTERN = re.compile(
        r"^(.*)(\([0-9A-Fx]+,[0-9A-F]+\))\s+([1-3C]+)\s+(.*)\s*$")

    # Cstor
    def __init__(self):
        self._Name = ''
        self._Tag = '(0000,0000)'
        self._Type = ''
        self._Description = ''

    def SetInit(self, s):
        """Fill all four fields from the first text line *s* of a row.

        Raises AssertionError (after echoing *s*) when the line does not
        have the "name (tag) type description" layout.
        """
        m = self._INIT_PATTERN.match(s)
        if not m:
            print(s)
            # Raised explicitly so the check also fires under `python -O`.
            raise AssertionError("not an attribute line: %r" % (s,))
        self._Name = m.group(1).strip()
        self._Tag = m.group(2).strip()
        self._Type = m.group(3).strip()
        self._Description = m.group(4).strip()

    def SetName(self, s):
        """Replace the attribute name with *s*."""
        self._Name = s

    def AppendName(self, s):
        """Append the stripped fragment *s* to the name, after one space."""
        self._Name += " "
        self._Name += s.strip()

    def SetTag(self, s):
        """Replace the tag with *s*."""
        self._Tag = s

    def SetType(self, s):
        """Replace the type with *s*."""
        self._Type = s

    def SetDescription(self, s):
        """Replace the description with *s*."""
        self._Description = s

    def AppendDescription(self, s):
        """Append the stripped fragment *s* to the description, after one space."""
        self._Description += " "
        self._Description += s.strip()

    @staticmethod
    def _EscapeXML(text):
        """Return *text* made safe for a double-quoted XML attribute value."""
        # '&' must be replaced first, otherwise the entities produced by the
        # later replacements would themselves be escaped again.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        return text.replace('"', '&quot;')

    def GetAsXML(self):
        """Return the attribute as a single self-closing XML element.

        Previously this method escaped nothing and returned an empty string,
        so no attribute data ever reached the output.

        NOTE(review): the original markup is not recoverable from this file;
        the element name and the attribute names used here are a
        reconstruction - confirm against whatever consumes the output.
        """
        return '<entry name="%s" tag="%s" type="%s" description="%s"/>' % (
            self._EscapeXML(self._Name),
            self._EscapeXML(self._Tag),
            self._EscapeXML(self._Type),
            self._EscapeXML(self._Description))

    def Print(self):
        """Print the XML form of the attribute on standard output."""
        print(self.GetAsXML())
class Part3Parser:
    """Scan a `pdftotext -layout` dump for attribute tables and write them out.

    The parser walks the input line by line.  When it sees a table caption
    ("Table C.x-y"), a table name and the column header line, it reads the
    following lines as attribute rows until a line that fits nothing, and
    writes one line per attribute (Attribute.GetAsXML()) to the output file.
    Diagnostics are printed on standard output.
    """

    # Page furniture that IsComment() skips inside a table.
    _COMMENT_PATTERNS = (
        re.compile(r"^\s+- Standard -\s*$"),
        re.compile(r"^\s*PS 3.3 - 2007\s*"),
        re.compile(r"^\s*Page\s+[0-9]+\s*$"),
        re.compile(r"^\s*Notes:$"),
    )

    # A caption standing alone on its line: "   Table C.7-1"
    _CAPTION_PATTERN = re.compile(r"^\s+Table\s+C.[0-9A-Za-z-.]+\s*$")

    # grrrrr: Table C.8-37 - RT SERIES MODULE ATTRIBUTES
    #         Table C.8-39--RT DOSE MODULE ATTRIBUTES
    # Caption and table name on the same line; group 1 is the name.
    _CAPTION_WITH_NAME_PATTERN = re.compile(
        r"^\s+Table\s+C.[0-9A-Za-z-]+\s*[-]*\s*([A-Z/\s-]+)\s*$")

    # Accepted spellings of a table name line, tried in this order.
    _TABLE_NAME_PATTERNS = (
        re.compile(r"^\s*[A-Z/\s-]+ATTRIBUTES\s*$"),
        re.compile(r"^\s+[A-Za-z\s]+Attributes\s*$"),
        # PALETTE COLOR LOOKUP MODULE
        re.compile(r"^\s+[A-Z\s]+MODULE\s*$"),
        # MR IMAGE AND SPECTROSCOPY INSTANCE MACRO
        re.compile(r"^\s+[A-Z\s]+MACRO\s*$"),
        # Enhanced XA/XRF Image Module Table
        re.compile(r"^\s+[A-Z/a-z\s]+Module Table\s*$"),
    )

    # Column header lines (the second form shows up around page 574).
    _TABLE_DESCRIPTION_PATTERNS = (
        re.compile(
            r"^\s*Attribute Name\s+Tag\s+Type\s+Attribute Description\s*$"),
        re.compile(r"^\s*Attribute [Nn]ame\s+Tag\s+Type\s+Description\s*$"),
    )

    # Bits Stored ... (0028,0101) ... 1 ... Number of bits stored for each pixel
    _FIRST_LINE_PATTERN = re.compile(
        r"^\s*(.*)\([0-9A-Fx]+,[0-9A-F]+\)\s+([1-3C]+).*\s*$")

    # Need to support : "Include `Image Pixel Macro' Table C.7-11b"
    _INCLUDE_PATTERN = re.compile(
        r"^>*\s*Include [`'\"]*([A-Za-z/ -]*)['\"]* "
        r"\(*Table [A-Z0-9a-z-.]+\)*.*$")

    # Same row layout as _FIRST_LINE_PATTERN but capturing the description
    # (group 1) so that its column can be measured.  \xc2\xb5 stands for the
    # micro sign: \xb5 matches it in decoded or latin-1 text, and the pair
    # matches its two UTF-8 bytes when byte strings are scanned (Python 2).
    _SHIFT_PATTERN = re.compile(
        r"^[A-Za-z0-9\xc2\xb5 /()'>-]+\s+\([0-9A-Fx]+,[0-9A-F]+\)"
        r"\s+[1-3][C]*\s+(.*)$")

    # Tables IsStartTable() refuses to parse.
    # C.11-4, C.13-*, C.22.1-1: Does not even comes with column type !!!
    # C.12-7 is difficult to parse
    # C.7.6.16-1 is insane...
    # TODO: Last line of C.19-1...
    _SKIPPED_TABLES = frozenset([
        'Table C.7-23', 'Table C.7-24', 'Table C.7.6.10-1', 'Table C.7-25',
        'Table C.7-26', 'Table C.7-27', 'Table C.8-8', 'Table C.8-19',
        'Table C.8-20', 'Table C.8-21', 'Table C.8-22', 'Table C.8-23',
        'Table C.8-80', 'Table C.8-83', 'Table C.8-84', 'Table C.8-85',
        'Table C.8-86', 'Table C.8-108', 'Table C.8-109', 'Table C.8-110',
        'Table C.8-111', 'Table C.8-112', 'Table C.8-115', 'Table C.8-116',
        'Table C.8-127', 'Table C.8-128', 'Table C.8-129', 'Table C.8-130',
        'Table C.8-131', 'Table C.8-132', 'Table C.8-133', 'Table C.8-134',
        'Table C.8.19.2-2', 'Table C.10-10', 'Table C.11-4', 'Table C.12-2',
        'Table C.12-3', 'Table C.12-4', 'Table C.12-5', 'Table C.12-7',
        'Table C.13-1', 'Table C.13-2', 'Table C.13-3', 'Table C.13-4',
        'Table C.13-5', 'Table C.13-7', 'Table C.13-8', 'Table C.13-9',
        'Table C.13-13', 'Table C.14-1', 'Table C.17.3-7', 'Table C.17.3-8',
        'Table C.22.1-1',
    ])

    # Text accepted in the name column of a continuation line (the tail of
    # an attribute name wrapped onto the next line).
    # The following is really ugly ... need to be fixed
    _NAME_CONTINUATIONS = frozenset([
        'Descriptor', 'Data', 'Center Name', 'Description', 'Sequence',
        'Distance', 'Index', 'Reordering', 'Time', 'Device Number',
        'Justification', 'Shape', 'Relationship', 'in Float', 'Displacement',
        'Technique Description', 'Left Vertical Edge', 'State Sequence',
        'In-plane', 'Certification Number', 'Right Vertical Edge',
        'Accumulated', 'Equivalent Thickness', 'Distances', 'Definition',
        'Upper Horizontal Edge', 'Modification', 'Power Ratio',
        'Lower Horizontal Edge', 'Device Distance', 'Sensing Region',
        'Control Sensing Region', 'Water Equivalent Thickness', 'Columns',
        'Rows', 'Ratio', 'Display Grayscale Value', 'Display CIELab Value',
        'UID', 'Units', 'Pointer', 'Value', 'Annotation',
        'Pointer Private Creator', 'Creator', 'Value Mapping Sequence',
        'Performed Procedure', 'MAC Sequence', 'Class UID', 'Instance UID',
        'Syntax UID', 'Used', 'Identifier', 'Datetime', 'plane Phase Steps',
        '(Patient)', 'Collection Center', 'Technique', 'Interpretation',
        'Representation', 'Configuration', 'Compression', 'Reference Code',
        'Encoding Steps', 'Steps in-plane', 'Steps out-of-plane', 'Type',
        'Explanation', 'Mapped', 'Calibration', 'Manufactured', 'Thickness',
        'Reference Sequence', 'Reference Number', 'Transmission', 'Matrix',
        'Comment', 'Setup Sequence', 'Setup Number', 'Fraction', 'Tolerance',
        'Number', 'Day', 'Parameters', 'Coefficient', 'Specification Point',
        'Identification Sequence', 'Reference UID', 'Synchronized',
        'Description Code Sequence', 'Concentration', 'Procedure Step',
        'Manufacturer', 'Lookup Table Data', 'Version', 'Images',
        'Wavelength', 'Code Sequence', 'Housing', 'Exposure', 'Beam',
        'Angle', 'Rotation Angle', 'Corner', 'Factor', 'Product',
        "Manufacturer's Model Name", 'Qualifier Code',
        'Mapping Instance Sequence', 'Channels', 'Samples',
        'Transformation Comment', 'Pixels', 'Correction Factor', 'Group',
        'Amount', 'Priority', 'Group Name', 'Frame Rate', 'Presence',
        'Sequencing', 'Orientation', 'Inverted', 'Numbers', 'Flag',
        'Annotation Flag', 'Demographics Flag', 'Techniques Flag',
        'Group Description', 'Handling', 'Initial View Direction',
        'Identification Code Sequence', 'Identification Code', 'Category',
        'Spatial Position', 'Creation Datetime', 'Grayscale Bit Depth',
        'Bit Depth', 'Repaint Time', 'Definition Sequence', 'Procedure Code',
        'Referenced', 'Reference', 'Usage Flag', 'Horizontal Dimension',
        'Dimension', 'Direction', 'Registration Sequence',
        'Transformation Matrix', 'Transformation Matrix Type',
        'Step Sequence',
    ])

    # Cstor
    def __init__(self):
        self._InputFilename = ''
        self._OutputFilename = ''
        self._Buffer = ''
        self._CurrentAttribute = Attribute()
        self._IsInTable = False
        # Width of the leading name column of the current row, measured by
        # FindShiftValue(); 0 means "not measured yet".
        self._Shift = 0

    def SetInputFileName(self, s):
        """Set the path of the text dump to read."""
        self._InputFilename = s

    def SetOutputFileName(self, s):
        """Set the path of the file to write."""
        self._OutputFilename = s

    def IsComment(self, s):
        """Return True when *s* is empty, page furniture or a column header."""
        if len(s) == 0:
            return True
        if any(patt.match(s) for patt in self._COMMENT_PATTERNS):
            print("Comment: %s" % s)
            return True
        if self.IsTableDescription(s):
            return True
        return False

    def IsStartTable(self, s):
        """Return True when *s* is the caption of a table that should be parsed.

        Sets self._IsInTable to the returned value.  Must only be called
        while no table is open (asserted).  Captions listed in
        _SKIPPED_TABLES are rejected.
        """
        assert self._IsInTable != True
        self._IsInTable = False
        if s.strip() in self._SKIPPED_TABLES:
            return False
        if (self._CAPTION_PATTERN.match(s)
                or self._CAPTION_WITH_NAME_PATTERN.match(s)):
            print("Start %s" % s)
            self._IsInTable = True
            return True
        print("IsTable failed with: %s" % s)
        return False

    def IsEndTable(self, s):
        """Close the current table; *s* must not be a comment line (asserted)."""
        assert self._IsInTable == True
        assert not self.IsComment(s)
        self._IsInTable = False
        return True

    def IsTableName(self, s):
        """Return True when *s* looks like a table name line."""
        for patt in self._TABLE_NAME_PATTERNS:
            if patt.match(s):
                print("Table Name %s" % s)
                return True
        print("TableName failed with: %s" % s)
        return False

    def IsTableName2(self, s):
        """Return True when the caption line *s* also carries the table name."""
        m = self._CAPTION_WITH_NAME_PATTERN.match(s)
        # The regex would think : Table C.7-17A
        # is correct (the trailing letter is taken for the name), so the
        # result is discarded when the captured name is a single character.
        if m and len(m.group(1)) > 1:
            print("Table Name: %s" % m.group(1))
            assert self.IsTableName(m.group(1))
            return True
        print("TableName2 failed with: %s" % s)
        return False

    def IsTableDescription(self, s):
        """Return True when *s* is the column header line of a table."""
        if any(patt.match(s) for patt in self._TABLE_DESCRIPTION_PATTERNS):
            print("Table Description: %s" % s)
            return True
        return False

    def IsFirstLineAttribute(self, s):
        """Return True when *s* opens an attribute row.

        The line must hold a non-empty name followed by a "(gggg,eeee)" tag
        and a type.
        """
        m = self._FIRST_LINE_PATTERN.match(s)
        if m:
            # A tag with nothing in front of it is not a row start.
            return m.group(1).strip() != ''
        return False

    def IsIncludeTable(self, s):
        """Return the match object when *s* is an "Include ... Table" row, else None."""
        return self._INCLUDE_PATTERN.match(s)

    def IsNextLineAttribute(self, s):
        """Try to consume *s* as a continuation line of the current attribute.

        With the first self._Shift columns blank, the whole line is appended
        to the description.  With a known name fragment in those columns,
        the fragment is appended to the name and the rest of the line to the
        description.  Returns True when the line was consumed.
        """
        if self._Shift == 0 or len(s) <= self._Shift:
            print("IsNextLineAttribute failed with %s" % s)
            return False
        blank = s[0:self._Shift].strip()
        if blank == '':
            self._CurrentAttribute.AppendDescription(s)
            return True
        if blank in self._NAME_CONTINUATIONS:
            self._CurrentAttribute.AppendName(blank)
            self._CurrentAttribute.AppendDescription(s[self._Shift:])
            return True
        print("ADD KEYWORD: %s" % blank)
        return False

    def FindShiftValue(self, s):
        """Measure the name-column width from the first line *s* of a row.

        Stores and returns the column where the description starts minus 17;
        returns 0, leaving self._Shift untouched, when *s* does not match.
        """
        m = self._SHIFT_PATTERN.match(s)
        if m:
            # worse case happen around page 448 with `Required`
            # worse case happen around page 475 with `LOG`...
            # m.start(1) is the real column of the description; searching for
            # the description text instead would hit an earlier occurrence
            # whenever the same words also appear in the attribute name.
            self._Shift = m.start(1) - 17
            return self._Shift
        print("OUCH: %s" % s)
        return 0

    @staticmethod
    def _NextLine(lines):
        """Return the next line of *lines* without its newline, or None at EOF."""
        line = next(lines, None)
        if line is None:
            return None
        return line.rstrip('\n')

    def _WriteCurrentAttribute(self, outfile):
        """Write the current attribute followed by a newline."""
        outfile.write(self._CurrentAttribute.GetAsXML())
        outfile.write('\n')

    def _ParseTableBody(self, lines, outfile):
        """Read attribute rows from *lines* until the table ends.

        The table ends at the first line that is neither an include row, a
        row start, a comment nor a continuation line, or at end of input.
        """
        pending = ''  # raw text of the row being assembled; '' when none
        self._CurrentAttribute = Attribute()
        self._Shift = 0
        for subline_ori in lines:
            subline = subline_ori.rstrip('\n')
            if self.IsIncludeTable(subline):
                # BUG DO NOT SUPPORT MULTI_LINE INCLUDE
                if subline != '':
                    # NOTE(review): writes an empty string; presumably the
                    # markup for the include row was lost - confirm.
                    outfile.write("")
                    outfile.write('\n')
            elif self.IsFirstLineAttribute(subline):
                if pending != '':
                    self._WriteCurrentAttribute(outfile)
                self._CurrentAttribute.SetInit(subline)
                self.FindShiftValue(subline)
                assert self._Shift != 0
                pending = subline
            elif not self.IsComment(subline):
                if self.IsNextLineAttribute(subline):
                    pending += ' ' + subline.strip()
                else:
                    print("Wotsit: %s" % subline)
                    break
        # Reached on the break above and when the input runs out, so the
        # last row of a table that extends to end of file is still written.
        self._Shift = 0
        self._IsInTable = False
        if pending != '':
            self._WriteCurrentAttribute(outfile)
        outfile.write('\n')

    def _ConvertLines(self, lines, outfile):
        """Copy every recognised table of *lines* to *outfile*."""
        lines = iter(lines)
        # To support some weird output from pdftotext
        # NOTE(review): the writes of empty strings in this method are kept
        # from the original; presumably document/table markup was lost -
        # confirm.
        outfile.write('')
        outfile.write('')
        for line_ori in lines:
            line = line_ori.rstrip('\n')
            if not self.IsStartTable(line):
                continue
            table_name_found = self.IsTableName2(line)
            line2 = line
            if not table_name_found:
                # Okay table is on next line:
                line2 = self._NextLine(lines)
                if line2 is None:
                    # Input ended right after the caption.
                    print("Problem with: %s" % line)
                    self._IsInTable = False
                    break
                table_name_found = self.IsTableName(line2)
            # Either way we need to find the table name
            assert table_name_found
            line3 = self._NextLine(lines)
            if line3 is not None and self.IsTableDescription(line3):
                # Ok we found a table
                outfile.write("")
                self._ParseTableBody(lines, outfile)
            else:
                print("Problem with: %s %s" % (line, line2))
                # No table was opened after all; clear the flag so that the
                # next IsStartTable() call does not trip its assertion.
                self._IsInTable = False
        outfile.write('')

    def Open(self):
        """Read the input file, write the output file, then call Write().

        Both files are closed on return, including when an assertion fires.
        """
        with open(self._InputFilename, 'r') as cmd_input:
            with open(self._OutputFilename, 'w') as outfile:
                self._ConvertLines(cmd_input, outfile)
        self.Write()

    def Write(self):
        """Print "Write"; the output file is already written by Open()."""
        print("Write")

    # Main function to call for parsing
    def Parse(self):
        """Run the whole conversion."""
        self.Open()
if __name__ == "__main__":
    # Usage: <script> <input text dump> <output file>
    arguments = os.sys.argv
    if len(arguments) < 3:
        print("Sorry, wrong list of args")
        os.sys.exit(1)  # error
    inputfilename, outputfilename = arguments[1], arguments[2]
    # NOTE(review): outputfilename is read but not used in the lines visible
    # here; the parser writes to the fixed temporary path below - confirm
    # whether a later step is expected to produce outputfilename.
    tempfile = "/tmp/mytemp2"
    dp = Part3Parser()
    dp.SetInputFileName(inputfilename)
    dp.SetOutputFileName(tempfile)
    dp.Parse()