Cleanup handling of TZ name parsing

Fully support the format where a TZ name is in parentheses after the
time (and possibly after an offset). This fixes the broken case where a
4 character TZ name was in parentheses after a time.
This commit is contained in:
Klondike Dragon 2023-12-30 12:10:37 -07:00
parent c4de5d4f6a
commit d5b3c60e9b
2 changed files with 109 additions and 70 deletions

View File

@ -1895,9 +1895,8 @@ iterRunes:
if !p.setYear() {
return p, p.unknownErr(datestr)
}
} else {
// allow multiple trailing whitespace
}
// else allow multiple trailing whitespace
case '+', '-':
// The year must be followed by a space before an offset!
if p.yearlen > 0 {
@ -1942,12 +1941,10 @@ iterRunes:
} else {
p.tzlen = i - p.tzi
}
if p.tzlen == 4 {
p.set(p.tzi, " MST")
} else if p.tzlen == 3 {
p.set(p.tzi, "MST")
} else if p.tzlen > 0 {
return p, p.unknownErr(datestr)
if p.tzlen > 0 {
if err := p.setTZName(datestr); err != nil {
return p, err
}
}
p.stateTime = timeWsAlphaZoneOffset
p.offseti = i
@ -1956,12 +1953,8 @@ iterRunes:
// 17:57:51 MST
// 06:20:00 (EST)
p.tzlen = i - p.tzi
if p.tzlen == 4 {
p.set(p.tzi, " MST")
} else if p.tzlen == 3 {
p.set(p.tzi, "MST")
} else if p.tzlen > 0 {
return p, p.unknownErr(datestr)
if err := p.setTZName(datestr); err != nil {
return p, err
}
if r == ' ' {
p.stateTime = timeWsAlphaWs
@ -2205,19 +2198,8 @@ iterRunes:
case r == ' ':
if p.tzi > 0 {
p.tzlen = i - p.tzi
switch p.tzlen {
case 3:
// 13:31:51.999 +01:00 CET
p.set(p.tzi, "MST")
case 4:
// 13:31:51.999 +01:00 CEST
p.set(p.tzi, "MST ")
default:
if p.simpleErrorMessages {
return p, ErrUnknownTimeZone
} else {
return p, fmt.Errorf("%w %q near %q (must be 3 or 4 characters)", ErrUnknownTimeZone, datestr, p.datestr[p.tzi:p.tzi+p.tzlen])
}
if err := p.setTZName(datestr); err != nil {
return p, err
}
} else {
return p, p.unknownErr(datestr)
@ -2353,18 +2335,9 @@ iterRunes:
switch p.stateTime {
case timeWsAlpha:
switch len(p.datestr) - p.tzi {
case 3:
// 13:31:51.999 +01:00 CET
p.set(p.tzi, "MST")
case 4:
p.set(p.tzi, "MST ")
default:
if p.simpleErrorMessages {
return p, ErrUnknownTimeZone
} else {
return p, fmt.Errorf("%w %q near %q (must be 3 or 4 characters)", ErrUnknownTimeZone, datestr, p.datestr[p.tzi:])
}
p.tzlen = i - p.tzi
if err := p.setTZName(datestr); err != nil {
return p, err
}
case timeWsAlphaRParen:
@ -2377,10 +2350,26 @@ iterRunes:
}
case timeWsOffsetWsTZDescInParen:
// The last character must be a closing ')'
if len(p.datestr) <= 0 || p.datestr[i-1] != ')' {
if i <= 0 || p.datestr[i-1] != ')' {
return p, p.unknownErr(datestr)
}
p.trimExtra(false)
// As a special case, if we don't yet have a timezone name,
// and the content in the paren is 3-4 characters, then treat
// this as a time zone name instead
if len(p.datestr) >= p.extra+1+3+1 {
parenContentsLen := (i - 1) - (p.extra + 2)
if p.tzi == 0 && (parenContentsLen >= 3 && parenContentsLen <= 4) {
p.tzi = p.extra + 2
p.tzlen = parenContentsLen
if err := p.setTZName(datestr); err != nil {
return p, err
}
p.extra = 0
}
}
if p.extra > 0 {
p.trimExtra(false)
}
case timeWsAlphaZoneOffset:
// 06:20:00 UTC-05
if err := p.setTZOffset(i, datestr); err != nil {
@ -2418,19 +2407,9 @@ iterRunes:
case timeWsOffsetWsAlphaZone:
// 00:12:00 +0000 UTC
if p.tzi > 0 {
switch len(p.datestr) - p.tzi {
case 3:
// 13:31:51.999 +01:00 CET
p.set(p.tzi, "MST")
case 4:
// 13:31:51.999 +01:00 CEST
p.set(p.tzi, "MST ")
default:
if p.simpleErrorMessages {
return p, ErrUnknownTimeZone
} else {
return p, fmt.Errorf("%w %q near %q (must be 3 or 4 characters)", ErrUnknownTimeZone, datestr, p.datestr[p.tzi:])
}
p.tzlen = i - p.tzi
if err := p.setTZName(datestr); err != nil {
return p, err
}
} else {
return p, p.unknownErr(datestr)
@ -2940,6 +2919,44 @@ func (p *parser) setTZOffset(i int, datestr string) error {
return nil
}
func (p *parser) setTZName(datestr string) error {
switch p.tzlen {
case 3:
p.set(p.tzi, "MST")
case 4:
p.set(p.tzi, "MST ")
default:
if p.simpleErrorMessages {
return ErrUnknownTimeZone
} else {
return fmt.Errorf("%w %q near %q (must be 3 or 4 characters)", ErrUnknownTimeZone, datestr, p.datestr[p.tzi:p.tzi+p.tzlen])
}
}
return nil
}
// Removes the characters at the given range from the format string.
// Fills the end of the format string with spaces rather than shortening it.
func (p *parser) removeRangeFromFormat(i, numBytes int) {
if i < 0 || i >= len(p.format) {
return
}
var startErase int
afterRemovedRange := i + numBytes
bytesToCopy := len(p.format) - afterRemovedRange
if bytesToCopy <= 0 {
// nothing to copy, erase everything from the removal point
startErase = i
} else {
copy(p.format[i:], p.format[afterRemovedRange:])
startErase = i + bytesToCopy
}
// fill in spaces to erase the moved content in its old location
for index := startErase; index < len(p.format); index++ {
p.format[index] = ' '
}
}
// Find the proper end of the current component (scanning chars starting from start and going
// up until the end, and either returning at end or returning the first character that is
// not allowed, as determined by allowNumeric, allowAlpha, and allowOther)
@ -3097,6 +3114,26 @@ func (p *parser) parse(originalLoc *time.Location, originalOpts ...ParserOption)
if p.t != nil {
return *p.t, nil
}
// Make sure that the entire string matched to a known format that was detected
if !p.allowPartialStringMatch && p.formatSetLen < len(p.format) {
// We can always ignore punctuation at the end of a date/time, but do not allow
// any numbers or letters in the format string.
validFormatTo := findProperEnd(bytesToString(p.format), p.formatSetLen, len(p.format), false, false, true)
if validFormatTo < len(p.format) {
return time.Time{}, p.unexpectedTail(p.formatSetLen)
}
}
// Special case where the TZ name is 4 characters long and followed by punctuation, will cause parsing problems
// with the format 'MST ' (will expect a whitespace that isn't there after 4 char timezone). Most robust
// solution is to remove the extra whitespace. Even though it will cause offsets after this point to not match
// between the datestr and format string, it's not an issue at this point.
if p.tzlen == 4 && p.tzi+4 < len(p.format) && p.format[p.tzi+3] == ' ' && p.format[p.tzi+4] != ' ' {
p.removeRangeFromFormat(p.tzi+3, 1)
}
// If we have a full month name, update the format string to use it (can change length of format string)
if len(p.fullMonth) > 0 {
p.setFullMonth(p.fullMonth)
}
@ -3110,7 +3147,7 @@ func (p *parser) parse(originalLoc *time.Location, originalOpts ...ParserOption)
// get out of this function to reduce scope it needs to be applied on
if err != nil && strings.Contains(err.Error(), "month out of range") {
// simple optimized case where mm and dd can be swapped directly
if p.molen == 2 && p.daylen == 2 {
if p.molen == 2 && p.daylen == 2 && len(p.fullMonth) <= 0 && (p.tzi == 0 || (p.moi < p.tzi && p.dayi < p.tzi)) {
// skipped bytes have already been removed, so compensate for that
moi := p.moi - p.skip
p.moi = p.dayi - p.skip
@ -3144,17 +3181,10 @@ func (p *parser) parse(originalLoc *time.Location, originalOpts ...ParserOption)
}()
}
// Make sure that the entire string matched to a known format that was detected
if !p.allowPartialStringMatch && p.formatSetLen < len(p.format) {
// We can always ignore punctuation at the end of a date/time, but do not allow
// any numbers or letters in the format string.
validFormatTo := findProperEnd(bytesToString(p.format), p.formatSetLen, len(p.format), false, false, true)
if validFormatTo < len(p.format) {
return time.Time{}, p.unexpectedTail(p.formatSetLen)
}
if p.skip > len(p.format) {
p.skip = len(p.format)
}
if p.skip > 0 && len(p.format) > p.skip {
if p.skip > 0 {
// copy and then re-slice to shorten to avoid losing the header of the pooled format string
copy(p.format, p.format[p.skip:])
p.format = p.format[:len(p.format)-p.skip]

View File

@ -225,9 +225,11 @@ var testInputs = []dateTest{
{in: "Thu, 03 Jul 2017 8:08:04 +0100", out: "2017-07-03 07:08:04 +0000 UTC"},
{in: "Thu, 03 Jul 2017 8:8:4 +0100", out: "2017-07-03 07:08:04 +0000 UTC"},
//
{in: "Tue, 11 Jul 2017 04:08:03 +0200 (CEST)", out: "2017-07-11 02:08:03 +0000 UTC"},
{in: "Tue, 5 Jul 2017 04:08:03 -0700 (MST)", out: "2017-07-05 11:08:03 +0000 UTC"},
{in: "Tue, 11 Jul 2017 04:08:03 +0200 (CEST)", out: "2017-07-11 02:08:03 +0000 UTC", zname: "CEST"},
{in: "Tue, 5 Jul 2017 04:08:03 -0700 (MST)", out: "2017-07-05 11:08:03 +0000 UTC", zname: "MST"},
{in: "Tue, 11 Jul 2017 04:08:03 +0200 (CEST)", out: "2017-07-11 02:08:03 +0000 UTC", loc: "Europe/Berlin", zname: "CEST"},
{in: "Tue, 11 Jul 2017 04:08:03 (CEST)", out: "2017-07-11 04:08:03 +0000 UTC", zname: "CEST"},
{in: "Tue, 5 Jul 2017 04:08:03 (MST)", out: "2017-07-05 04:08:03 +0000 UTC", zname: "MST"},
// day, dd-Mon-yy hh:mm:zz TZ
{in: "Fri, 03-Jul-15 08:08:08 MST", out: "2015-07-03 08:08:08 +0000 UTC", zname: "MST"},
{in: "Fri, 03-Jul-15 08:08:08 CEST", out: "2015-07-03 08:08:08 +0000 UTC", zname: "CEST"},
@ -330,14 +332,18 @@ var testInputs = []dateTest{
{in: "04/02/2014 04:08:09 AM", out: "2014-04-02 04:08:09 +0000 UTC"},
{in: "04/02/2014 04:08:09AM PST", out: "2014-04-02 04:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09 AM PST", out: "2014-04-02 04:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09 AM (PST)", out: "2014-04-02 04:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09AM CEST", out: "2014-04-02 04:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08:09 AM CEST", out: "2014-04-02 04:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08:09 AM (CEST)", out: "2014-04-02 04:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08:09pm", out: "2014-04-02 16:08:09 +0000 UTC"},
{in: "04/02/2014 04:08:09 PM", out: "2014-04-02 16:08:09 +0000 UTC"},
{in: "04/02/2014 04:08:09PM PST", out: "2014-04-02 16:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09 PM PST", out: "2014-04-02 16:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09 PM (PST)", out: "2014-04-02 16:08:09 +0000 UTC", zname: "PST"},
{in: "04/02/2014 04:08:09pm CEST", out: "2014-04-02 16:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08:09 PM CEST", out: "2014-04-02 16:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08:09 PM (CEST)", out: "2014-04-02 16:08:09 +0000 UTC", zname: "CEST"},
{in: "04/02/2014 04:08am", out: "2014-04-02 04:08:00 +0000 UTC"},
{in: "04/02/2014 04:08 AM", out: "2014-04-02 04:08:00 +0000 UTC"},
{in: "04/02/2014 04:08pm", out: "2014-04-02 16:08:00 +0000 UTC"},
@ -822,7 +828,7 @@ func TestParse(t *testing.T) {
}
fullInput := prefix + th.in
t.Run(fmt.Sprintf("simpleerr-%v-addweekday-%v-%s", simpleErrorMessage, addWeekday, fullInput), func(t *testing.T) {
t.Run(fmt.Sprintf("simpleerr-%v/addweekday-%v/%s", simpleErrorMessage, addWeekday, fullInput), func(t *testing.T) {
var ts time.Time
defer func() {
if r := recover(); r != nil {
@ -1167,6 +1173,9 @@ func TestInLocation(t *testing.T) {
ts = MustParse("Tue, 5 Jul 2017 16:28:13 -0700 (MST)")
assert.Equal(t, "2017-07-05 23:28:13 +0000 UTC", fmt.Sprintf("%v", ts.In(time.UTC)))
ts = MustParse("Tue, 5 Jul 2017 16:28:13 +0300 (CEST)")
assert.Equal(t, "2017-07-05 13:28:13 +0000 UTC", fmt.Sprintf("%v", ts.In(time.UTC)))
// Now we are going to use ParseIn() and see that it gives different answer
// with different zone, offset
time.Local = nil
@ -1311,6 +1320,6 @@ func TestRetryAmbiguousDateWithSwap(t *testing.T) {
// Convenience function for debugging a particular broken test case
func TestDebug(t *testing.T) {
ts := MustParse("Monday 19/03/2012 00:00:00", RetryAmbiguousDateWithSwap(true))
assert.Equal(t, "2012-03-19 00:00:00 +0000 UTC", fmt.Sprintf("%v", ts.In(time.UTC)))
ts := MustParse("September 17, 2012 at 10:09am CEST+02", RetryAmbiguousDateWithSwap(true))
assert.Equal(t, "2012-09-17 08:09:00 +0000 UTC", fmt.Sprintf("%v", ts.In(time.UTC)))
}